Production-Grade PDF RAG Complete Guide: Four-Layer Architecture from Parsing to Evaluation
In the previous article, "Interviewer Asks: How Does RAG Handle PDFs", we provided a standard answer framework for interview scenarios. This article is for developers who want to understand in depth how production-grade PDF RAG is actually implemented. It covers all four layers thoroughly, with code, technology selection, and practical techniques for each layer.
1. Layer 1: Parsing Layer — First Identify PDF Type, Then Choose Parsing Method
1.1 PDF Type Detection
import fitz # PyMuPDF
def detect_pdf_type(pdf_path: str) -> str:
    """Identify PDF type: native text / scanned / mixed content.

    The classification is a rough heuristic. For every page the text "area"
    is approximated as ``len(text) * 100`` and the image area as the sum of
    ``width * height`` over the entries returned by ``page.get_images()``;
    both are divided by the page area and averaged over all pages.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        ``"native_text"`` when the average text ratio exceeds 0.3,
        ``"scanned"`` when the average image ratio exceeds 0.5, otherwise
        ``"mixed"`` (this includes documents without pages).
    """
    doc = fitz.open(pdf_path)
    try:
        # The page count must be read while the document is still open;
        # querying a closed document raises instead of returning a length.
        page_count = len(doc)
        text_ratio = 0.0
        image_ratio = 0.0
        for page in doc:
            page_area = page.rect.width * page.rect.height
            if page_area <= 0:
                # A degenerate page contributes nothing to either ratio.
                continue
            text_area = len(page.get_text()) * 100  # Rough estimate of text area
            # NOTE(review): img[2] / img[3] are taken as the image's own
            # width / height, not its rendered size on the page, so this
            # ratio is only comparable across similar documents.
            image_area = sum(img[2] * img[3] for img in page.get_images())
            text_ratio += text_area / page_area
            image_ratio += image_area / page_area
    finally:
        # Release the file handle even when a page fails to parse.
        doc.close()

    avg_text_ratio = text_ratio / page_count if page_count > 0 else 0
    avg_image_ratio = image_ratio / page_count if page_count > 0 else 0
    if avg_text_ratio > 0.3:
        return "native_text"  # Native text PDF
    if avg_image_ratio > 0.5:
        return "scanned"  # Scanned PDF
    return "mixed"  # Mixed content PDF
# Usage: classify a file first so that the matching parser can be chosen
pdf_type = detect_pdf_type("contract.pdf")
print(f"PDF type: {pdf_type}")
1.2 Native Text PDF Parsing
import fitz
class NativeTextParser:
    """Native text PDF parser.

    Opens the document on construction. Call :meth:`close` (or use the
    parser as a context manager) to release the underlying file handle.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Close the underlying document; safe to call more than once."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def extract_structured_text(self) -> list:
        """Extract text with structural information.

        Returns:
            One dict per page with ``page_num`` (1-based) and ``blocks``.
            Every text line becomes a ``"text"`` block carrying its text,
            the font size of its first span (12 when the line has no
            spans), its bounding box and an ``is_heading`` flag; image
            blocks become ``"image"`` blocks carrying only their bounding box.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            # Get text blocks (with position info)
            blocks = page.get_text("dict")["blocks"]
            page_blocks = []
            for block in blocks:
                if block["type"] == 0:  # Text block
                    for line in block["lines"]:
                        spans = line["spans"]
                        font_size = spans[0]["size"] if spans else 12
                        page_blocks.append({
                            "type": "text",
                            "text": "".join(span["text"] for span in spans),
                            "font_size": font_size,
                            "bbox": line["bbox"],
                            # Simple heading detection: anything above 14pt
                            "is_heading": font_size > 14,
                        })
                elif block["type"] == 1:  # Image block
                    page_blocks.append({
                        "type": "image",
                        "bbox": block["bbox"],
                    })
            pages.append({"page_num": page_num + 1, "blocks": page_blocks})
        return pages

    def extract_tables(self, page_num: int) -> list:
        """Extract tables (requires camelot or tabula).

        Args:
            page_num: 0-based page index; it is converted to the 1-based
                page string passed to camelot.

        Returns:
            One ``DataFrame.to_dict()`` mapping per detected table, or an
            empty list when camelot is not installed.
        """
        try:
            import camelot
        except ImportError:
            print("Please install camelot: pip install camelot-py")
            return []
        tables = camelot.read_pdf(self.pdf_path, pages=str(page_num + 1))
        return [table.df.to_dict() for table in tables]
# Usage: parse a native-text PDF into per-page block lists
parser = NativeTextParser("report.pdf")
pages = parser.extract_structured_text()
print(f"Extracted {len(pages)} pages")
1.3 Scanned PDF Parsing (OCR)
import pytesseract
from PIL import Image
import fitz
class OCRParser:
    """Scanned PDF parser (OCR).

    Opens the document on construction. Call :meth:`close` (or use the
    parser as a context manager) to release the underlying file handle.
    """

    def __init__(self, pdf_path: str, lang: str = "eng+chi_sim"):
        self.pdf_path = pdf_path
        self.lang = lang
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Close the underlying document; safe to call more than once."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _render_page(self, page, dpi: int):
        """Rasterise one page at ``dpi`` and return it as a PIL image."""
        # Scale relative to the 72 DPI base used by the original snippet.
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    @staticmethod
    def _summarize_confidences(raw_conf) -> tuple:
        """Return ``(average, low_count)`` for the positive confidences.

        Values may arrive as ints, floats or numeric strings (including
        fractional ones such as ``"96.5"``); anything unparsable and
        anything ``<= 0`` is ignored. ``low_count`` counts values below 60.
        """
        scores = []
        for raw in raw_conf:
            try:
                score = float(raw)
            except (TypeError, ValueError):
                continue
            if score > 0:
                scores.append(score)
        average = sum(scores) / len(scores) if scores else 0
        return average, sum(1 for score in scores if score < 60)

    def extract_text(self, dpi: int = 300) -> list:
        """Extract text via OCR.

        Returns:
            One dict per page with ``page_num`` (1-based), the recognised
            ``text`` and the ``dpi`` used for rendering.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            img = self._render_page(page, dpi)
            # OCR recognition
            text = pytesseract.image_to_string(img, lang=self.lang)
            pages.append({
                "page_num": page_num + 1,
                "text": text,
                "dpi": dpi,
            })
        return pages

    def get_confidence(self, dpi: int = 300) -> list:
        """Get OCR confidence (for quality assessment).

        Returns:
            One dict per page with ``page_num`` (1-based),
            ``avg_confidence`` (mean of the positive confidences, 0 when
            there are none) and ``low_conf_words`` (count below 60).
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            img = self._render_page(page, dpi)
            # Get detailed data (with confidence)
            data = pytesseract.image_to_data(
                img, lang=self.lang, output_type=pytesseract.Output.DICT
            )
            avg_conf, low_count = self._summarize_confidences(data["conf"])
            pages.append({
                "page_num": page_num + 1,
                "avg_confidence": avg_conf,
                "low_conf_words": low_count,
            })
        return pages
# Usage: OCR a scanned PDF, then report the first page's average confidence.
# NOTE(review): confidence[0] assumes the document has at least one page.
ocr_parser = OCRParser("scanned_contract.pdf")
pages = ocr_parser.extract_text()
confidence = ocr_parser.get_confidence()
print(f"OCR avg confidence: {confidence[0]['avg_confidence']:.1f}%")
1.4 Mixed Content PDF Parsing (Multimodal)
import base64
import fitz
class MultimodalParser:
    """Mixed content PDF parser (multimodal).

    Opens the document on construction. Call :meth:`close` (or use the
    parser as a context manager) to release the underlying file handle.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Close the underlying document; safe to call more than once."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def extract_with_images(self) -> list:
        """Extract text and images (images encoded as base64).

        Returns:
            One dict per page with ``page_num`` (1-based), the page
            ``text`` and ``images``: a list of dicts holding the image
            ``xref``, its ``base64`` payload and its file ``extension``.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            images = []
            for img in page.get_images():
                xref = img[0]
                base_image = self.doc.extract_image(xref)
                images.append({
                    "xref": xref,
                    "base64": base64.b64encode(base_image["image"]).decode("utf-8"),
                    "extension": base_image["ext"],
                })
            pages.append({
                "page_num": page_num + 1,
                "text": page.get_text(),
                "images": images,
            })
        return pages

    def analyze_with_vision(self, api_key: str) -> list:
        """Analyze images with vision model.

        Args:
            api_key: OpenAI API key used for the requests made here.

        Returns:
            One dict per page with ``page_num``, ``text`` and
            ``image_descriptions`` (one model answer per extracted image).
        """
        import openai

        # Use an explicit client so the key passed by the caller is the one
        # actually used, instead of whatever the environment provides.
        client = openai.OpenAI(api_key=api_key)
        analyzed_pages = []
        for page in self.extract_with_images():
            image_descriptions = []
            for img in page["images"]:
                response = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "user", "content": [
                            {"type": "text", "text": "Describe this image content, including chart types, data, text, etc."},
                            {"type": "image_url", "image_url": {
                                "url": f"data:image/{img['extension']};base64,{img['base64']}"
                            }},
                        ]}
                    ],
                    max_tokens=500,
                )
                image_descriptions.append(response.choices[0].message.content)
            analyzed_pages.append({
                "page_num": page["page_num"],
                "text": page["text"],
                "image_descriptions": image_descriptions,
            })
        return analyzed_pages
# Usage: extract page text plus embedded images as base64 payloads
mm_parser = MultimodalParser("report_with_charts.pdf")
pages = mm_parser.extract_with_images()
# analyzed = mm_parser.analyze_with_vision("your-api-key") # Needs API key
2. Layer 2: Structure Restoration — Preserve PDF Native Structure
2.1 Document Structure Restorer
class DocumentStructure:
    """Document structure restoration.

    Plain container for the restored structure of one document: its
    sections, tables, figures and footnotes, each stored as a list of dicts.
    """

    def __init__(self):
        self.title = ""
        self.sections = []   # Section list
        self.tables = []     # Table list
        self.figures = []    # Figure list
        self.footnotes = []  # Footnote list

    def add_section(self, level: int, title: str, page_num: int, content: str):
        """Append a section record (``level``, ``title``, ``page_num``, ``content``)."""
        self.sections.append({
            "level": level,
            "title": title,
            "page_num": page_num,
            "content": content,
        })

    def add_table(self, table_id: str, page_num: int, caption: str, data: dict,
                  structured_text: str = ""):
        """Append a table record.

        Args:
            table_id: Identifier used to look the table up later.
            page_num: Page the table appears on.
            caption: Table caption.
            data: Raw table data.
            structured_text: Optional textual rendering of the table. It is
                always stored (empty by default) so that consumers reading
                ``table["structured_text"]`` never hit a missing key.
        """
        self.tables.append({
            "table_id": table_id,
            "page_num": page_num,
            "caption": caption,
            "data": data,
            "structured_text": structured_text,
        })

    def add_figure(self, figure_id: str, page_num: int, caption: str, description: str):
        """Append a figure record (``figure_id``, ``page_num``, ``caption``, ``description``)."""
        self.figures.append({
            "figure_id": figure_id,
            "page_num": page_num,
            "caption": caption,
            "description": description,
        })


class StructureExtractor:
    """Extract document structure from parsing results"""

    def extract(self, pages: list) -> DocumentStructure:
        """Group text blocks into sections delimited by heading blocks.

        Args:
            pages: Page dicts with a ``blocks`` list. Only blocks of type
                ``"text"`` are used; pages without ``blocks`` are skipped.

        Returns:
            A ``DocumentStructure`` whose sections carry the text that
            follows each heading. A heading is a text block flagged
            ``is_heading`` that is shorter than 100 characters; its level is
            1 above 18pt and 2 otherwise. Text that appears before the first
            heading is not attached to any section.
        """
        doc = DocumentStructure()
        current_section = None
        for page in pages:
            for block in page.get("blocks", []):
                if block["type"] != "text":
                    continue
                text = block["text"].strip()
                # Detect headings (font size > 14)
                if block.get("is_heading") and len(text) < 100:
                    level = 1 if block["font_size"] > 18 else 2
                    doc.add_section(level, text, page["page_num"], "")
                    # Keep a reference to the record that was just stored, so
                    # body text is accumulated on the section inside `doc`
                    # rather than on a detached copy that is thrown away.
                    current_section = doc.sections[-1]
                elif current_section is not None:
                    current_section["content"] += text + "\n"
        return doc
# Usage
# NOTE(review): extract() only reads each page's "blocks" list, so `pages`
# must be block-structured output such as the result of
# NativeTextParser.extract_structured_text(); pages lacking "blocks"
# yield zero sections.
extractor = StructureExtractor()
doc_structure = extractor.extract(pages)
print(f"Extracted {len(doc_structure.sections)} sections")
2.2 Table Structured Processing
import pandas as pd
class TableProcessor:
    """Table structured processing"""

    def process_table(self, table_data: dict, table_id: str, page_num: int) -> dict:
        """Convert table to structured text.

        Args:
            table_data: Column-oriented mapping accepted by ``pd.DataFrame``
                (for example the result of ``DataFrame.to_dict()``).
            table_id: Identifier echoed into the description and result.
            page_num: Page number echoed into the description and result.

        Returns:
            Dict with ``table_id``, ``page_num``, ``columns`` (the original
            column labels), the untouched ``data`` and ``structured_text``,
            a line-per-row textual rendering of the table.
        """
        df = pd.DataFrame(table_data)
        # Column labels may be integers (e.g. positional headers), and
        # str.join only accepts strings.
        column_names = ", ".join(str(col) for col in df.columns)
        lines = [
            f"Table {table_id} (Page {page_num}):",
            f"Columns: {column_names}",
            f"Rows: {len(df)}",
            "Data:",
        ]
        # Number rows by position so non-numeric index labels cannot break
        # the "Row N" arithmetic.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            cells = ", ".join(f"{col}={val}" for col, val in row.items())
            lines.append(f" Row {position}: {cells}")
        return {
            "table_id": table_id,
            "page_num": page_num,
            "columns": df.columns.tolist(),
            "data": table_data,
            "structured_text": "\n".join(lines) + "\n",
        }

    def extract_with_camelot(self, pdf_path: str, page_num: int) -> list:
        """Extract tables using camelot.

        Args:
            pdf_path: Path of the PDF file.
            page_num: Page number, passed to camelot unchanged as the
                ``pages`` string.

        Returns:
            One dict per detected table with ``table_id``, ``page_num``,
            ``data`` and ``accuracy``; an empty list when camelot is not
            installed.
        """
        try:
            import camelot
        except ImportError:
            print("Please install camelot: pip install camelot-py")
            return []
        tables = camelot.read_pdf(pdf_path, pages=str(page_num))
        return [
            {
                "table_id": f"table_{page_num}_{i}",
                "page_num": page_num,
                "data": table.df.to_dict(),
                "accuracy": table.parsing_report.get("accuracy", 0),
            }
            for i, table in enumerate(tables)
        ]
# Usage: table_data is a column-oriented mapping, e.g. DataFrame.to_dict()
processor = TableProcessor()
# structured_table = processor.process_table(table_data, "table_1", 5)
3. Layer 3: Chunking and Indexing — Chunk by Semantic Structure
3.1 Semantic Chunking Strategy
class SemanticChunker:
    """Semantic chunker"""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        # NOTE(review): stored for API compatibility but not applied by
        # chunk_by_structure; chunks are cut at paragraph boundaries only.
        self.chunk_overlap = chunk_overlap

    def chunk_by_structure(self, doc: "DocumentStructure") -> list:
        """Chunk by document structure.

        Every section becomes one chunk (heading + content). Sections longer
        than ``chunk_size`` are split at blank-line paragraph boundaries and
        each piece repeats the section heading. A single paragraph longer
        than ``chunk_size`` is kept whole. Tables are emitted afterwards as
        separate chunks.

        Args:
            doc: Object exposing ``sections`` and ``tables`` lists of dicts.

        Returns:
            List of ``{"text": ..., "metadata": ...}`` chunk dicts.
        """
        chunks = []
        for section in doc.sections:
            header = f"# {section['title']}\n\n"
            # Heading + content as one chunk
            whole = header + section["content"]
            if len(whole) <= self.chunk_size:
                chunks.append(self._create_chunk(whole, section))
                continue

            # Content is too long: split by paragraph
            emitted_before = len(chunks)
            current = header
            for para in section["content"].split("\n\n"):
                if not para.strip():
                    continue
                if len(current) + len(para) < self.chunk_size:
                    current += para + "\n\n"
                    continue
                # Flush only when the pending chunk has body text; otherwise
                # an oversized paragraph would produce a heading-only chunk.
                if current != header:
                    chunks.append(self._create_chunk(current, section))
                current = f"{header}{para}\n\n"
            if current != header or len(chunks) == emitted_before:
                chunks.append(self._create_chunk(current, section))

        # Tables chunked separately
        for table in doc.tables:
            chunks.append({
                "text": self._table_text(table),
                "metadata": {
                    "type": "table",
                    "table_id": table["table_id"],
                    "page_num": table["page_num"],
                    "section": "table",
                },
            })
        return chunks

    @staticmethod
    def _table_text(table: dict) -> str:
        """Return the table's structured text, or a basic rendering without one."""
        text = table.get("structured_text")
        if text:
            return text
        # Table records are not guaranteed to carry a pre-rendered text.
        return (
            f"Table {table['table_id']} (Page {table['page_num']}): "
            f"{table.get('caption', '')}\n{table.get('data')}"
        )

    def _create_chunk(self, text: str, section: dict) -> dict:
        """Create chunk with metadata"""
        return {
            "text": text,
            "metadata": {
                "type": "text",
                "section_title": section["title"],
                "section_level": section["level"],
                "page_num": section["page_num"],
                "context": f"This is content from the '{section['title']}' section of the document.",
            },
        }
# Usage: turn the restored structure into section and table chunks
chunker = SemanticChunker(chunk_size=1000, chunk_overlap=200)
chunks = chunker.chunk_by_structure(doc_structure)
print(f"Generated {len(chunks)} chunks")
3.2 Chunking Strategy Comparison
| Strategy | Pros | Cons | Best For |
|---|---|---|---|
| Fixed-length chunking | Simple implementation | Cuts context | Simple documents |
| Paragraph chunking | Preserves paragraph integrity | Uneven paragraph lengths | Normal documents |
| Semantic structure chunking | Preserves document structure | Complex implementation | Production-grade apps |
| Section chunking | Clear logic | Uneven section lengths | Long documents |
3.3 Vector Index Construction
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
class VectorIndexer:
    """Vector index builder"""

    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.vectorstore = None

    def _require_index(self):
        """Return the built vector store or raise if none exists yet."""
        if self.vectorstore is None:
            raise ValueError("Please build index first")
        return self.vectorstore

    def build_index(self, chunks: list, collection_name: str = "pdf_rag"):
        """Build vector index.

        Args:
            chunks: Chunk dicts with ``text`` and ``metadata`` keys.
            collection_name: Name of the collection to create.

        Returns:
            The vector store, which is also kept on ``self.vectorstore``.
        """
        self.vectorstore = Chroma.from_texts(
            texts=[chunk["text"] for chunk in chunks],
            embedding=self.embeddings,
            metadatas=[chunk["metadata"] for chunk in chunks],
            collection_name=collection_name,
        )
        return self.vectorstore

    def search(self, query: str, top_k: int = 5) -> list:
        """Retrieve relevant chunks.

        Raises:
            ValueError: If no index has been built yet.
        """
        return self._require_index().similarity_search(query, k=top_k)

    def search_with_metadata(self, query: str, filter_dict: dict, top_k: int = 5) -> list:
        """Search with metadata filtering.

        Raises:
            ValueError: If no index has been built yet (same contract as
                :meth:`search`).
        """
        return self._require_index().similarity_search(
            query, k=top_k, filter=filter_dict
        )
# Usage
indexer = VectorIndexer()
vectorstore = indexer.build_index(chunks)
results = indexer.search("What is the contract duration?")
for r in results:
    # Table chunks carry no 'section_title' in their metadata, so look the
    # key up defensively instead of indexing it directly.
    print(f"[{r.metadata.get('section_title', 'Unknown')}] {r.page_content[:100]}...")
4. Layer 4: Agent Invocation — Encapsulate as Toolchain
4.1 PDF Agent Toolchain
from langchain.agents import Tool, AgentExecutor, create_react_agent
from langchain.prompts import PromptTemplate
class PDFAgentTools:
    """PDF Agent toolchain.

    Wraps a vector store and a restored document structure as a list of
    agent tools (``self.tools``).
    """

    def __init__(self, vectorstore, doc_structure: "DocumentStructure"):
        self.vectorstore = vectorstore
        self.doc_structure = doc_structure
        self.tools = self._create_tools()

    def _create_tools(self) -> list:
        """Build the tool list exposed to the agent."""
        return [
            Tool(
                name="search_pdf",
                func=self._search_pdf,
                description="Retrieve relevant snippets from PDF. Input: question"
            ),
            Tool(
                name="read_page",
                func=self._read_page,
                description="Read complete content of specified page. Input: page number"
            ),
            Tool(
                name="extract_table",
                func=self._extract_table,
                description="Extract specified table. Input: table ID"
            ),
            Tool(
                name="analyze_chart",
                func=self._analyze_chart,
                description="Analyze chart content. Input: chart ID"
            ),
            Tool(
                name="quote_source",
                func=self._quote_source,
                description="Return citation source. Input: chunk content"
            ),
        ]

    def _search_pdf(self, query: str) -> str:
        """Return the five most similar chunks, each prefixed with its section title."""
        results = self.vectorstore.similarity_search(query, k=5)
        return "\n\n".join(
            f"[{r.metadata.get('section_title', 'Unknown')}] {r.page_content}"
            for r in results
        )

    def _read_page(self, page_num: int) -> str:
        """Return the stored sections recorded for the given page.

        Accepts an int or a numeric string. Only sections whose recorded
        ``page_num`` equals the requested page are returned; an explanatory
        message is returned for invalid input or when nothing is stored.
        """
        try:
            wanted = int(str(page_num).strip())
        except ValueError:
            return f"Invalid page number: {page_num}"
        parts = [
            f"# {section['title']}\n{section['content']}"
            for section in self.doc_structure.sections
            if section["page_num"] == wanted
        ]
        if not parts:
            # Report the miss instead of returning invented page text.
            return f"No content found for page {wanted}"
        return "\n\n".join(parts)

    def _extract_table(self, table_id: str) -> str:
        """Return the textual rendering of the table with the given ID."""
        wanted = str(table_id).strip()
        for table in self.doc_structure.tables:
            if table["table_id"] != wanted:
                continue
            text = table.get("structured_text")
            if text:
                return text
            # Table records are not guaranteed to carry a pre-rendered text.
            return (
                f"Table {table['table_id']} (Page {table['page_num']}): "
                f"{table.get('caption', '')}\n{table.get('data')}"
            )
        return f"Table {wanted} not found"

    def _analyze_chart(self, figure_id: str) -> str:
        """Return the stored description of the figure with the given ID."""
        wanted = str(figure_id).strip()
        for figure in self.doc_structure.figures:
            if figure["figure_id"] == wanted:
                return figure["description"]
        return f"Chart {wanted} not found"

    def _quote_source(self, content: str) -> str:
        """Return section title and page of the chunk most similar to ``content``."""
        results = self.vectorstore.similarity_search(content, k=1)
        if not results:
            return "Source not found"
        best = results[0]
        return (
            f"Source: {best.metadata.get('section_title', 'Unknown')}, "
            f"Page {best.metadata.get('page_num', '?')}"
        )
# Usage: wrap the index and the restored structure as agent tools
tools = PDFAgentTools(vectorstore, doc_structure)
print(f"Available tools: {[t.name for t in tools.tools]}")
4.2 Agent Invocation Logic
def create_pdf_agent(tools, llm):
    """Create PDF Agent.

    Args:
        tools: List of agent tools the model may call.
        llm: Language model driving the ReAct loop.

    Returns:
        An ``AgentExecutor`` wrapping a ReAct agent built from the prompt
        below.
    """
    # A ReAct prompt must expose {tools}, {tool_names} and
    # {agent_scratchpad} in addition to {input}; without the last two the
    # agent cannot be constructed.
    prompt_template = """You are a PDF document assistant. Use the following tools to answer user questions:
{tools}
Use this format:
Question: user question
Thought: reasoning process
Action: tool name, must be one of [{tool_names}]
Action Input: tool input
Observation: tool output
... (repeat)
Final Answer: final answer (must include citation sources)
Question: {input}
Thought:{agent_scratchpad}"""
    prompt = PromptTemplate.from_template(prompt_template)
    agent = create_react_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
    return agent_executor
# Usage (requires a configured `llm`; left commented out for that reason)
# agent = create_pdf_agent(tools.tools, llm)
# result = agent.invoke({"input": "What is the contract duration?"})
5. Production-Grade Must-Have Details
5.1 Traceable Citations
class CitationManager:
    """Citation manager"""

    def add_citations(self, answer: str, sources: list) -> str:
        """Add citations to answer.

        Args:
            answer: Answer text to annotate.
            sources: Source dicts; ``section_title`` and ``page_num`` are
                read, with ``'Unknown'`` / ``'?'`` used when a key is absent
                (for example for table chunks, which have no section title).

        Returns:
            The answer followed by a numbered "Sources" list.
        """
        entries = "".join(
            f"[{i}] {source.get('section_title', 'Unknown')}, "
            f"Page {source.get('page_num', '?')}\n"
            for i, source in enumerate(sources, 1)
        )
        return answer + "\n\n**Sources:**\n" + entries

    def verify_citation(self, answer: str, source_text: str,
                        min_overlap: float = 0.5) -> bool:
        """Verify citation accuracy (prevent hallucination).

        Simple lexical check: key tokens of the answer (words of four or
        more characters, plus every token containing a digit) are looked up
        in the source text, case-insensitively. The citation is accepted
        when every digit-bearing token occurs in the source and at least
        ``min_overlap`` of all key tokens do. An answer without key tokens
        is accepted.

        This is a coarse filter based on word tokens; in production, use
        NLI models for stricter verification.
        """
        import re

        def tokens(text: str) -> set:
            return set(re.findall(r"\w+", text.lower()))

        source_tokens = tokens(source_text)
        key_tokens = {
            tok for tok in tokens(answer)
            if len(tok) >= 4 or any(ch.isdigit() for ch in tok)
        }
        if not key_tokens:
            return True
        # Figures (amounts, dates, page counts) must match exactly: a wrong
        # number is the most damaging kind of unsupported claim.
        numeric = {tok for tok in key_tokens if any(ch.isdigit() for ch in tok)}
        if not numeric <= source_tokens:
            return False
        return len(key_tokens & source_tokens) / len(key_tokens) >= min_overlap
# Usage
citation_mgr = CitationManager()
# Example inputs: the answer text plus the metadata of the chunks it used.
answer = "The contract term is three years."
sources = [{"section_title": "Contract Duration", "page_num": 3}]
cited_answer = citation_mgr.add_citations(answer, sources)
5.2 Evaluation Loop
class RAGEvaluator:
    """RAG evaluator"""

    # Metrics this evaluator actually measures; the remaining result keys
    # are reported as 0 and excluded from the overall score.
    _MEASURED = ("retrieval_accuracy", "page_accuracy")

    @staticmethod
    def _answer_text(response) -> str:
        """Return the answer text from an agent response.

        Dict responses are read through their ``"output"`` key; anything
        else is converted with ``str``.
        """
        if isinstance(response, dict):
            return str(response.get("output", ""))
        return str(response)

    def evaluate(self, test_cases: list, agent) -> dict:
        """Evaluate RAG system.

        Args:
            test_cases: Dicts with ``question``, ``expected_section`` and
                ``expected_page``.
            agent: Object whose ``invoke({"input": question})`` returns the
                answer, either as text or as a dict with an ``"output"`` key.

        Returns:
            Dict of metric name to score in ``[0, 1]``. Only
            ``retrieval_accuracy`` and ``page_accuracy`` are measured;
            ``overall_score`` is their mean. All scores are 0 when
            ``test_cases`` is empty.
        """
        import re

        results = {
            "retrieval_accuracy": 0,  # Retrieved snippets match original text
            "page_accuracy": 0,       # Page numbers correct
            "table_accuracy": 0,      # Tables parsed accurately
            "ocr_accuracy": 0,        # OCR missed characters
            "chart_accuracy": 0,      # Chart info correctly understood
            "overall_score": 0,
        }
        n = len(test_cases)
        if n == 0:
            # Nothing to average; avoid dividing by zero.
            return results

        for case in test_cases:
            # Run Agent
            answer = self._answer_text(agent.invoke({"input": case["question"]}))
            # Evaluate retrieval accuracy
            if case["expected_section"] in answer:
                results["retrieval_accuracy"] += 1
            # Evaluate page accuracy: match the page as a whole number so
            # that page 3 is not satisfied by "13" or "2023".
            page = re.escape(str(case["expected_page"]))
            if re.search(rf"(?<!\d){page}(?!\d)", answer):
                results["page_accuracy"] += 1

        # Calculate averages
        for key in results:
            if key != "overall_score":
                results[key] /= n
        results["overall_score"] = (
            sum(results[key] for key in self._MEASURED) / len(self._MEASURED)
        )
        return results
# Usage: each case names the section title and page the answer should cite
evaluator = RAGEvaluator()
test_cases = [
{"question": "What is the contract duration?", "expected_section": "Contract Duration", "expected_page": 3},
{"question": "What is the annual growth rate?", "expected_section": "Financial Data", "expected_page": 12},
]
# results = evaluator.evaluate(test_cases, agent)
6. Technology Selection Summary
| Layer | Recommended Tools | Alternatives |
|---|---|---|
| Native text parsing | PyMuPDF | pdfplumber, PyPDF2 |
| OCR | Tesseract + pytesseract | PaddleOCR, EasyOCR |
| Multimodal parsing | GPT-4o / Claude | Gemini, local models |
| Table extraction | Camelot | Tabula, pdfplumber |
| Vector database | Chroma / Pinecone | Weaviate, Milvus |
| Embedding | text-embedding-3-small | Cohere, local models |
| Agent framework | LangChain | LlamaIndex, custom |
7. Summary
| Layer | Core | Key Output |
|---|---|---|
| Parsing Layer | Classify first, then parse | Text + images + tables |
| Structure Restoration | Preserve native structure | Sections + tables + figures |
| Chunking & Indexing | Chunk by semantic structure, with metadata | High-quality chunks + vector index |
| Agent Invocation | Encapsulate toolchain, invoke on demand | Traceable answers |
One-sentence summary: Production-grade PDF RAG is not "convert PDF to text then vectorize" — it's a complete engineering project with four layers: parsing, restoration, chunking, and invocation, each requiring careful design.
📌 Related Reading:
- Interview answer: Interviewer Asks: How Does RAG Handle PDFs — Don't Just Say "Convert to Text and Chunk"
- Prompt Engineering series: Prompt Engineering Advanced: From "Writing Instructions" to "Designing Iterable Prompt Systems"