Production-Grade PDF RAG Complete Guide: Four-Layer Architecture from Parsing to Evaluation

Production-Grade PDF RAG Complete Guide: Four-Layer Architecture from Parsing to Evaluation

In the previous article, "Interviewer Asks: How Does RAG Handle PDFs?", we provided a standard answer framework for interview scenarios. This article is for developers who want to understand in depth how production-grade PDF RAG is actually implemented. It walks through all four layers, giving the code, technology choices, and practical techniques for each one.

1. Layer 1: Parsing Layer — First Identify PDF Type, Then Choose Parsing Method

1.1 PDF Type Detection

import fitz  # PyMuPDF

def detect_pdf_type(
    pdf_path: str,
    text_threshold: float = 0.3,
    image_threshold: float = 0.5,
) -> str:
    """Classify a PDF as native text, scanned, or mixed content.

    For every page a rough "text area" (character count * 100) and a rough
    "image area" (sum of embedded image width * height) are divided by the
    page area; the per-page ratios are then averaged over the document.

    Args:
        pdf_path: Path of the PDF file to inspect.
        text_threshold: Average text ratio above which the document is
            reported as ``"native_text"``.
        image_threshold: Average image ratio above which the document is
            reported as ``"scanned"`` (only checked when the text test fails).

    Returns:
        ``"native_text"``, ``"scanned"``, or ``"mixed"``. A document with no
        pages yields ``"mixed"`` because both averages are zero.
    """
    doc = fitz.open(pdf_path)
    try:
        # Read the page count while the document is still open: the original
        # code called len(doc) after doc.close(), which PyMuPDF rejects.
        page_count = len(doc)

        text_ratio = 0.0
        image_ratio = 0.0
        for page in doc:
            page_area = page.rect.width * page.rect.height
            if page_area <= 0:
                # A degenerate page contributes nothing to either ratio.
                continue

            # Rough estimate of the area covered by text.
            text_area = len(page.get_text()) * 100
            # width * height taken from each image entry; a coarse proxy for
            # how much of the page the images cover.
            image_area = sum(img[2] * img[3] for img in page.get_images())

            text_ratio += text_area / page_area
            image_ratio += image_area / page_area
    finally:
        # Always release the file handle, even if a page fails to parse.
        doc.close()

    avg_text_ratio = text_ratio / page_count if page_count > 0 else 0
    avg_image_ratio = image_ratio / page_count if page_count > 0 else 0

    if avg_text_ratio > text_threshold:
        return "native_text"  # Native text PDF
    if avg_image_ratio > image_threshold:
        return "scanned"      # Scanned PDF
    return "mixed"            # Mixed content PDF

# Usage: classify the file first so that the matching parser can be chosen.
# detect_pdf_type() returns one of "native_text", "scanned", or "mixed".
pdf_type = detect_pdf_type("contract.pdf")
print(f"PDF type: {pdf_type}")

1.2 Native Text PDF Parsing

import fitz

class NativeTextParser:
    """Parser for PDFs that carry a native text layer.

    The PyMuPDF document is opened in the constructor and stays open until
    ``close()`` is called. The class can also be used as a context manager
    (``with NativeTextParser(path) as parser: ...``) so the file is released
    automatically.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Close the underlying document; calling it again is a no-op."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def extract_structured_text(self) -> list:
        """Extract text lines and image placeholders, page by page.

        Returns:
            One dict per page: ``{"page_num": <1-based>, "blocks": [...]}``.
            Text entries carry ``text``, ``font_size``, ``bbox`` and
            ``is_heading``; image entries carry only ``bbox``.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            # "dict" mode returns blocks together with position information.
            raw_blocks = page.get_text("dict")["blocks"]
            page_blocks = []

            for block in raw_blocks:
                if block["type"] == 0:  # Text block
                    for line in block["lines"]:
                        spans = line["spans"]
                        text = "".join(span["text"] for span in spans)
                        # The first span's size stands in for the whole line;
                        # 12 is the fallback when a line has no spans.
                        font_size = spans[0]["size"] if spans else 12

                        page_blocks.append({
                            "type": "text",
                            "text": text,
                            "font_size": font_size,
                            "bbox": line["bbox"],
                            "is_heading": font_size > 14  # Simple heading detection
                        })
                elif block["type"] == 1:  # Image block
                    page_blocks.append({
                        "type": "image",
                        "bbox": block["bbox"]
                    })

            pages.append({
                "page_num": page_num + 1,
                "blocks": page_blocks
            })

        return pages

    def extract_tables(self, page_num: int) -> list:
        """Extract the tables of one page (requires camelot).

        Args:
            page_num: Zero-based page index; it is converted to the one-based
                page string passed to ``camelot.read_pdf``.

        Returns:
            A list with one ``DataFrame.to_dict()`` result per detected table,
            or an empty list when camelot is not installed.
        """
        try:
            import camelot
        except ImportError:
            print("Please install camelot: pip install camelot-py")
            return []

        tables = camelot.read_pdf(self.pdf_path, pages=str(page_num + 1))
        return [table.df.to_dict() for table in tables]

# Usage
# extract_structured_text() returns one dict per page, so len(pages) is the
# number of pages that were read.
parser = NativeTextParser("report.pdf")
pages = parser.extract_structured_text()
print(f"Extracted {len(pages)} pages")

1.3 Scanned PDF Parsing (OCR)

import pytesseract
from PIL import Image
import fitz

class OCRParser:
    """Scanned PDF parser that renders each page and runs Tesseract OCR.

    The PyMuPDF document is opened in the constructor and stays open until
    ``close()`` is called; the class also works as a context manager.
    """

    def __init__(self, pdf_path: str, lang: str = "eng+chi_sim"):
        self.pdf_path = pdf_path
        self.lang = lang
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Close the underlying document; calling it again is a no-op."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _render_page(self, page, dpi: int):
        """Rasterize one page at ``dpi`` and return it as a PIL RGB image."""
        zoom = dpi / 72  # 72 is the default DPI of the page coordinate space
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    def extract_text(self, dpi: int = 300) -> list:
        """Run OCR on every page.

        Args:
            dpi: Resolution used when rendering each page to an image.

        Returns:
            One dict per page with ``page_num`` (1-based), ``text`` and ``dpi``.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            img = self._render_page(page, dpi)
            text = pytesseract.image_to_string(img, lang=self.lang)

            pages.append({
                "page_num": page_num + 1,
                "text": text,
                "dpi": dpi
            })

        return pages

    def get_confidence(self, dpi: int = 300) -> list:
        """Report OCR confidence per page (for quality assessment).

        Args:
            dpi: Resolution used when rendering each page to an image.

        Returns:
            One dict per page with ``page_num`` (1-based), ``avg_confidence``
            (mean of the confidences greater than 0, or 0 when there are
            none) and ``low_conf_words`` (count of those below 60).
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            img = self._render_page(page, dpi)

            # Detailed per-item data, including a "conf" column.
            data = pytesseract.image_to_data(
                img, lang=self.lang, output_type=pytesseract.Output.DICT
            )

            confidences = []
            for raw in data["conf"]:
                # Parse via float(): int() rejects values such as "96.06",
                # which some Tesseract/pytesseract versions may report.
                try:
                    value = float(raw)
                except (TypeError, ValueError):
                    continue  # Skip entries that are not numeric at all
                if value > 0:
                    confidences.append(value)

            avg_conf = sum(confidences) / len(confidences) if confidences else 0
            pages.append({
                "page_num": page_num + 1,
                "avg_confidence": avg_conf,
                "low_conf_words": sum(1 for c in confidences if c < 60)
            })

        return pages

# Usage
# extract_text() and get_confidence() each render and OCR every page, so this
# snippet runs the OCR pass twice. Only the first page's confidence is printed.
ocr_parser = OCRParser("scanned_contract.pdf")
pages = ocr_parser.extract_text()
confidence = ocr_parser.get_confidence()
print(f"OCR avg confidence: {confidence[0]['avg_confidence']:.1f}%")

1.4 Mixed Content PDF Parsing (Multimodal)

import base64
import fitz

class MultimodalParser:
    """Mixed content PDF parser (text plus embedded images).

    The PyMuPDF document is opened in the constructor and stays open until
    ``close()`` is called; the class also works as a context manager.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Close the underlying document; calling it again is a no-op."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def extract_with_images(self) -> list:
        """Extract page text and embedded images (images encoded as base64).

        Returns:
            One dict per page with ``page_num`` (1-based), ``text`` and
            ``images``; each image entry has ``xref``, ``base64`` and
            ``extension``.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            text = page.get_text()

            images = []
            for img in page.get_images():
                xref = img[0]
                base_image = self.doc.extract_image(xref)
                images.append({
                    "xref": xref,
                    "base64": base64.b64encode(base_image["image"]).decode("utf-8"),
                    "extension": base_image["ext"]
                })

            pages.append({
                "page_num": page_num + 1,
                "text": text,
                "images": images
            })

        return pages

    def analyze_with_vision(self, api_key: str) -> list:
        """Describe every embedded image with a vision model.

        Args:
            api_key: OpenAI API key used for the requests. The original code
                accepted this argument but never passed it to the client.

        Returns:
            One dict per page with ``page_num``, ``text`` and
            ``image_descriptions`` (one model reply per image, in page order).
        """
        import openai

        # Build an explicit client so the supplied key is actually used.
        client = openai.OpenAI(api_key=api_key)

        analyzed_pages = []
        for page in self.extract_with_images():
            image_descriptions = []
            for img in page["images"]:
                response = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "user", "content": [
                            {"type": "text", "text": "Describe this image content, including chart types, data, text, etc."},
                            {"type": "image_url", "image_url": {
                                "url": f"data:image/{img['extension']};base64,{img['base64']}"
                            }}
                        ]}
                    ],
                    max_tokens=500
                )
                image_descriptions.append(response.choices[0].message.content)

            analyzed_pages.append({
                "page_num": page["page_num"],
                "text": page["text"],
                "image_descriptions": image_descriptions
            })

        return analyzed_pages

# Usage
# extract_with_images() makes no API calls; the vision step below is left
# commented out because it sends every image to the model.
mm_parser = MultimodalParser("report_with_charts.pdf")
pages = mm_parser.extract_with_images()
# analyzed = mm_parser.analyze_with_vision("your-api-key")  # Needs API key

2. Layer 2: Structure Restoration — Preserve PDF Native Structure

2.1 Document Structure Restorer

class DocumentStructure:
    """In-memory record of a document's restored structure.

    Keeps a title string and four lists (sections, tables, figures,
    footnotes). Each ``add_*`` method appends one plain dict built from
    its arguments to the corresponding list.
    """

    def __init__(self):
        self.title = ""
        self.sections = list()   # Section entries
        self.tables = list()     # Table entries
        self.figures = list()    # Figure entries
        self.footnotes = list()  # Footnote entries

    def add_section(self, level: int, title: str, page_num: int, content: str):
        """Append a section entry (level, title, page_num, content)."""
        entry = dict(level=level, title=title, page_num=page_num, content=content)
        self.sections.append(entry)

    def add_table(self, table_id: str, page_num: int, caption: str, data: dict):
        """Append a table entry (table_id, page_num, caption, data)."""
        entry = dict(table_id=table_id, page_num=page_num, caption=caption, data=data)
        self.tables.append(entry)

    def add_figure(self, figure_id: str, page_num: int, caption: str, description: str):
        """Append a figure entry (figure_id, page_num, caption, description)."""
        entry = dict(
            figure_id=figure_id,
            page_num=page_num,
            caption=caption,
            description=description,
        )
        self.figures.append(entry)

class StructureExtractor:
    """Extract document structure from parsing results"""

    def extract(self, pages: list) -> DocumentStructure:
        """Group parsed text blocks into sections keyed by their headings.

        Args:
            pages: Page dicts that each carry ``page_num`` and a ``blocks``
                list whose text entries have ``type``, ``text``,
                ``font_size`` and ``is_heading``. Pages without a ``blocks``
                key contribute nothing.

        Returns:
            A DocumentStructure with one section per detected heading.
            Text that appears before the first heading is not stored.
        """
        doc = DocumentStructure()
        current_section = None

        for page in pages:
            for block in page.get("blocks", []):
                if block["type"] != "text":
                    continue

                text = block["text"].strip()

                # A heading is a block flagged as such and shorter than 100 characters.
                if block.get("is_heading") and len(text) < 100:
                    level = 1 if block["font_size"] > 18 else 2
                    doc.add_section(level, text, page["page_num"], "")
                    # Keep a reference to the dict that add_section stored.
                    # The original built a separate dict here, so body text
                    # never reached doc.sections and content stayed empty.
                    current_section = doc.sections[-1]
                elif current_section is not None:
                    current_section["content"] += text + "\n"

        return doc

# Usage
# NOTE: `pages` must be the output of NativeTextParser.extract_structured_text().
# extract() only reads each page's "blocks" list, and the page dicts produced
# by OCRParser and MultimodalParser do not contain that key.
extractor = StructureExtractor()
doc_structure = extractor.extract(pages)
print(f"Extracted {len(doc_structure.sections)} sections")

2.2 Table Structured Processing

import pandas as pd

class TableProcessor:
    """Table structured processing"""

    def process_table(self, table_data: dict, table_id: str, page_num: int) -> dict:
        """Convert a table into a structured text description.

        Args:
            table_data: Anything ``pandas.DataFrame`` accepts as a dict, for
                example the ``DataFrame.to_dict()`` output stored by
                ``extract_with_camelot``.
            table_id: Identifier used in the description and the result.
            page_num: Page number used in the description and the result.

        Returns:
            A dict with ``table_id``, ``page_num``, ``columns`` (the original
            column labels), ``data`` (the input, unchanged) and
            ``structured_text`` (header lines followed by one line per row).
        """
        df = pd.DataFrame(table_data)
        columns = df.columns.tolist()

        lines = [
            f"Table {table_id} (Page {page_num}):",
            # Column labels may be integers, and str.join() only accepts
            # strings, so convert each label explicitly.
            f"Columns: {', '.join(str(col) for col in columns)}",
            f"Rows: {len(df)}",
            "Data:",
        ]

        # Number rows by position so non-integer index labels cannot break
        # the "Row N" prefix; for a default 0..n-1 index the output is unchanged.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            cells = ", ".join(f"{col}={val}" for col, val in row.items())
            lines.append(f"  Row {position}: {cells}")

        return {
            "table_id": table_id,
            "page_num": page_num,
            "columns": columns,
            "data": table_data,
            "structured_text": "\n".join(lines) + "\n"
        }

    def extract_with_camelot(self, pdf_path: str, page_num: int) -> list:
        """Extract the tables of one page using camelot.

        Args:
            pdf_path: Path of the PDF file.
            page_num: Page number, passed to camelot unchanged as the
                ``pages`` string.

        Returns:
            One dict per table with ``table_id``, ``page_num``, ``data``
            (``DataFrame.to_dict()``) and ``accuracy``; an empty list when
            camelot is not installed.
        """
        try:
            import camelot
        except ImportError:
            print("Please install camelot: pip install camelot-py")
            return []

        tables = camelot.read_pdf(pdf_path, pages=str(page_num))
        return [
            {
                "table_id": f"table_{page_num}_{i}",
                "page_num": page_num,
                "data": table.df.to_dict(),
                "accuracy": table.parsing_report.get("accuracy", 0)
            }
            for i, table in enumerate(tables)
        ]

# Usage
# `table_data` is not defined in this snippet; one possible source is the
# "data" entry of a result returned by extract_with_camelot().
processor = TableProcessor()
# structured_table = processor.process_table(table_data, "table_1", 5)

3. Layer 3: Chunking and Indexing — Chunk by Semantic Structure

3.1 Semantic Chunking Strategy

class SemanticChunker:
    """Chunker that follows the document's section and table structure.

    ``chunk_overlap`` is stored for API compatibility, but the chunking
    logic in this class does not apply any overlap.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_by_structure(self, doc: "DocumentStructure") -> list:
        """Chunk by document structure.

        Args:
            doc: Object exposing ``sections`` and ``tables`` lists of dicts.

        Returns:
            Chunk dicts (``text`` plus ``metadata``): all section chunks in
            document order, followed by one chunk per table.
        """
        chunks = []
        for section in doc.sections:
            chunks.extend(self._chunk_section(section))

        # Tables are chunked separately, one chunk per table.
        chunks.extend(self._chunk_table(table) for table in doc.tables)
        return chunks

    def _chunk_section(self, section: dict) -> list:
        """Return the chunk(s) for one section, splitting on blank lines if too long."""
        header = f"# {section['title']}\n\n"
        full_text = header + section["content"]

        # Heading + content fits in one chunk.
        if len(full_text) <= self.chunk_size:
            return [self._create_chunk(full_text, section)]

        pieces = []
        current = header
        for para in section["content"].split("\n\n"):
            if len(current) + len(para) < self.chunk_size:
                current += para + "\n\n"
                continue
            # Flush only when something follows the heading. The original
            # emitted a heading-only chunk whenever the first paragraph was
            # already too large to fit.
            if self._has_body(current, header):
                pieces.append(self._create_chunk(current, section))
            current = f"{header}{para}\n\n"

        if self._has_body(current, header):
            pieces.append(self._create_chunk(current, section))
        return pieces

    @staticmethod
    def _has_body(chunk_text: str, header: str) -> bool:
        """Tell whether ``chunk_text`` holds more than the section heading."""
        return chunk_text.strip() != header.strip()

    @staticmethod
    def _chunk_table(table: dict) -> dict:
        """Build the chunk for one table entry."""
        # Table entries are not guaranteed to carry "structured_text"
        # (DocumentStructure.add_table does not store it), so fall back to
        # the caption and raw data instead of raising KeyError.
        text = table.get("structured_text")
        if not text:
            text = f"{table.get('caption', '')}\n{table.get('data', '')}".strip()
        return {
            "text": text,
            "metadata": {
                "type": "table",
                "table_id": table["table_id"],
                "page_num": table["page_num"],
                "section": "table"
            }
        }

    def _create_chunk(self, text: str, section: dict) -> dict:
        """Create a text chunk carrying the section's metadata."""
        return {
            "text": text,
            "metadata": {
                "type": "text",
                "section_title": section["title"],
                "section_level": section["level"],
                "page_num": section["page_num"],
                "context": f"This is content from the '{section['title']}' section of the document."
            }
        }

# Usage
# chunk_overlap is stored by the constructor, but chunk_by_structure() does
# not read it, so the value passed here has no effect on the output.
chunker = SemanticChunker(chunk_size=1000, chunk_overlap=200)
chunks = chunker.chunk_by_structure(doc_structure)
print(f"Generated {len(chunks)} chunks")

3.2 Chunking Strategy Comparison

| Strategy | Pros | Cons | Best For |
|---|---|---|---|
| Fixed-length chunking | Simple implementation | Cuts context | Simple documents |
| Paragraph chunking | Preserves paragraph integrity | Uneven paragraph lengths | Normal documents |
| Semantic structure chunking | Preserves document structure | Complex implementation | Production-grade apps |
| Section chunking | Clear logic | Uneven section lengths | Long documents |

3.3 Vector Index Construction

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

class VectorIndexer:
    """Vector index builder and query helper.

    ``build_index`` must be called before either search method; until then
    ``vectorstore`` is ``None`` and searching raises ``ValueError``.
    """

    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.vectorstore = None

    def build_index(self, chunks: list, collection_name: str = "pdf_rag"):
        """Build the vector index from chunk dicts.

        Args:
            chunks: Dicts with a ``text`` string and a ``metadata`` dict.
            collection_name: Name of the Chroma collection to create.

        Returns:
            The created vector store (also kept on ``self.vectorstore``).
        """
        texts = [chunk["text"] for chunk in chunks]
        metadatas = [chunk["metadata"] for chunk in chunks]

        self.vectorstore = Chroma.from_texts(
            texts=texts,
            embedding=self.embeddings,
            metadatas=metadatas,
            collection_name=collection_name
        )

        return self.vectorstore

    def _require_index(self):
        """Return the vector store, raising ValueError if it is not built yet."""
        if self.vectorstore is None:
            raise ValueError("Please build index first")
        return self.vectorstore

    def search(self, query: str, top_k: int = 5) -> list:
        """Retrieve the ``top_k`` chunks most similar to ``query``.

        Raises:
            ValueError: If ``build_index`` has not been called.
        """
        return self._require_index().similarity_search(query, k=top_k)

    def search_with_metadata(self, query: str, filter_dict: dict, top_k: int = 5) -> list:
        """Retrieve similar chunks restricted by a metadata filter.

        Raises:
            ValueError: If ``build_index`` has not been called. The original
                lacked this guard and failed with AttributeError on ``None``.
        """
        return self._require_index().similarity_search(
            query, k=top_k, filter=filter_dict
        )

# Usage
indexer = VectorIndexer()
vectorstore = indexer.build_index(chunks)
results = indexer.search("What is the contract duration?")
for r in results:
    # Table chunks carry no 'section_title' in their metadata, so look the
    # key up defensively instead of indexing (which would raise KeyError).
    print(f"[{r.metadata.get('section_title', 'Unknown')}] {r.page_content[:100]}...")

4. Layer 4: Agent Invocation — Encapsulate as Toolchain

4.1 PDF Agent Toolchain

from langchain.agents import Tool, AgentExecutor, create_react_agent
from langchain.prompts import PromptTemplate

class PDFAgentTools:
    """PDF Agent toolchain.

    Wraps a vector store and a document structure as five agent tools:
    search_pdf, read_page, extract_table, analyze_chart and quote_source.
    Tool inputs are normalized because agents pass them as raw strings.
    """

    def __init__(self, vectorstore, doc_structure: "DocumentStructure"):
        self.vectorstore = vectorstore
        self.doc_structure = doc_structure
        self.tools = self._create_tools()

    def _create_tools(self) -> list:
        """Build the list of Tool objects exposed to the agent."""
        return [
            Tool(
                name="search_pdf",
                func=self._search_pdf,
                description="Retrieve relevant snippets from PDF. Input: question"
            ),
            Tool(
                name="read_page",
                func=self._read_page,
                description="Read complete content of specified page. Input: page number"
            ),
            Tool(
                name="extract_table",
                func=self._extract_table,
                description="Extract specified table. Input: table ID"
            ),
            Tool(
                name="analyze_chart",
                func=self._analyze_chart,
                description="Analyze chart content. Input: chart ID"
            ),
            Tool(
                name="quote_source",
                func=self._quote_source,
                description="Return citation source. Input: chunk content"
            )
        ]

    @staticmethod
    def _clean_id(raw) -> str:
        """Strip whitespace and surrounding quotes from a tool input."""
        return str(raw).strip().strip("'\"")

    def _search_pdf(self, query: str) -> str:
        """Return the five most similar chunks, each prefixed by its section title."""
        results = self.vectorstore.similarity_search(query, k=5)
        return "\n\n".join([
            f"[{r.metadata.get('section_title', 'Unknown')}] {r.page_content}"
            for r in results
        ])

    def _read_page(self, page_num: int) -> str:
        """Return the sections recorded for one page.

        The original returned a fixed placeholder sentence, which handed the
        agent fabricated content. This version joins the sections whose
        ``page_num`` equals the requested page. Because a section is recorded
        on a single page, content that continues onto later pages is only
        returned for the page stored with its section.

        Args:
            page_num: Page number as an int or a numeric string.

        Returns:
            The matching sections as text, or a message when the input is
            not a number or no section is recorded for that page.
        """
        try:
            page = int(self._clean_id(page_num))
        except ValueError:
            return f"Invalid page number: {page_num}"

        parts = [
            f"# {section['title']}\n{section['content']}"
            for section in self.doc_structure.sections
            if section["page_num"] == page
        ]
        if not parts:
            return f"No content found for page {page}"
        return "\n\n".join(parts)

    def _extract_table(self, table_id: str) -> str:
        """Return the text of the table with the given ID, or a not-found message."""
        wanted = self._clean_id(table_id)
        for table in self.doc_structure.tables:
            if table["table_id"] == wanted:
                # "structured_text" is optional on table entries; fall back
                # to caption + raw data rather than raising KeyError.
                text = table.get("structured_text")
                if not text:
                    text = f"{table.get('caption', '')}\n{table.get('data', '')}".strip()
                return text
        return f"Table {wanted} not found"

    def _analyze_chart(self, figure_id: str) -> str:
        """Return the stored description of a figure, or a not-found message."""
        wanted = self._clean_id(figure_id)
        for figure in self.doc_structure.figures:
            if figure["figure_id"] == wanted:
                return figure["description"]
        return f"Chart {wanted} not found"

    def _quote_source(self, content: str) -> str:
        """Return section title and page of the chunk most similar to ``content``."""
        results = self.vectorstore.similarity_search(content, k=1)
        if results:
            r = results[0]
            return f"Source: {r.metadata.get('section_title', 'Unknown')}, Page {r.metadata.get('page_num', '?')}"
        return "Source not found"

# Usage
# The constructor builds the tool list immediately, so `tools.tools` is ready
# to be handed to an agent.
tools = PDFAgentTools(vectorstore, doc_structure)
print(f"Available tools: {[t.name for t in tools.tools]}")

4.2 Agent Invocation Logic

def create_pdf_agent(tools, llm):
    """Create a ReAct-style PDF agent.

    Args:
        tools: List of Tool objects the agent may call.
        llm: Language model passed to ``create_react_agent``.

    Returns:
        An ``AgentExecutor`` wrapping the agent and its tools (verbose mode on).
    """

    # create_react_agent requires the prompt to expose {tools}, {tool_names}
    # and {agent_scratchpad}. The original template had only {tools} and
    # {input}, so building the agent failed on the missing variables.
    prompt_template = """You are a PDF document assistant. Use the following tools to answer user questions:

{tools}

Use this format:
Question: user question
Thought: reasoning process
Action: tool name, must be one of [{tool_names}]
Action Input: tool input
Observation: tool output
... (repeat)
Final Answer: final answer (must include citation sources)

Question: {input}
Thought:{agent_scratchpad}"""

    prompt = PromptTemplate.from_template(prompt_template)
    agent = create_react_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

    return agent_executor

# Usage
# agent = create_pdf_agent(tools.tools, llm)
# result = agent.invoke({"input": "What is the contract duration?"})

5. Production-Grade Must-Have Details

5.1 Traceable Citations

class CitationManager:
    """Citation manager"""

    def add_citations(self, answer: str, sources: list) -> str:
        """Append a numbered source list to an answer.

        Args:
            answer: The generated answer text.
            sources: Dicts that may carry ``section_title`` and ``page_num``;
                missing keys are rendered as ``Unknown`` and ``?`` instead of
                raising KeyError.

        Returns:
            The answer followed by a ``**Sources:**`` block with one line
            per source.
        """
        lines = [
            f"[{i}] {source.get('section_title', 'Unknown')}, "
            f"Page {source.get('page_num', '?')}\n"
            for i, source in enumerate(sources, 1)
        ]
        return answer + "\n\n**Sources:**\n" + "".join(lines)

    def verify_citation(self, answer: str, source_text: str, min_overlap: float = 0.5) -> bool:
        """Check lexically whether the answer's key terms occur in the source.

        The original always returned True, so no citation could ever fail.
        This is a simple word-overlap heuristic, not an entailment check;
        production systems should use an NLI model for stricter verification.

        Key terms are the answer's numeric tokens and its words longer than
        three characters. Every numeric token must appear in the source, and
        at least ``min_overlap`` of all key terms must appear in it.

        Args:
            answer: The generated answer text.
            source_text: The cited source passage.
            min_overlap: Minimum fraction of key terms that must be found.

        Returns:
            False when the source is empty, a number from the answer is
            missing from the source, or the overlap is below ``min_overlap``;
            True otherwise (including when the answer has no key terms).
        """
        import re

        if not source_text or not source_text.strip():
            return False

        source_tokens = set(re.findall(r"\w+", source_text.lower()))
        key_terms = {
            token
            for token in re.findall(r"\w+", (answer or "").lower())
            if token.isdigit() or len(token) > 3
        }
        if not key_terms:
            return True

        # Numbers are the facts most likely to be hallucinated, so each one
        # has to be present verbatim in the source.
        if any(t.isdigit() and t not in source_tokens for t in key_terms):
            return False

        supported = sum(1 for t in key_terms if t in source_tokens)
        return supported / len(key_terms) >= min_overlap

# Usage
# NOTE: `answer` and `sources` are not defined in this snippet. add_citations()
# reads "section_title" and "page_num" from every source dict.
citation_mgr = CitationManager()
cited_answer = citation_mgr.add_citations(answer, sources)

5.2 Evaluation Loop

class RAGEvaluator:
    """RAG evaluator"""

    def evaluate(self, test_cases: list, agent) -> dict:
        """Evaluate a RAG agent against a list of test cases.

        Only ``retrieval_accuracy`` and ``page_accuracy`` are measured here;
        the table, OCR and chart metrics are placeholders that stay at 0 and
        still count in ``overall_score``.

        Args:
            test_cases: Dicts with ``question``, ``expected_section`` and
                ``expected_page``.
            agent: Object whose ``invoke({"input": ...})`` returns either a
                string or a dict with an ``output`` entry.

        Returns:
            A dict of per-metric averages plus ``overall_score`` (the mean
            of all metrics). Every value is 0 when ``test_cases`` is empty.
        """
        import re

        results = {
            "retrieval_accuracy": 0,  # Retrieved snippets match original text
            "page_accuracy": 0,       # Page numbers correct
            "table_accuracy": 0,      # Tables parsed accurately
            "ocr_accuracy": 0,        # OCR missed characters
            "chart_accuracy": 0,      # Chart info correctly understood
            "overall_score": 0
        }

        n = len(test_cases)
        if n == 0:
            # Nothing to average; avoids the division by zero below.
            return results

        for case in test_cases:
            answer_text = self._answer_text(agent.invoke({"input": case["question"]}))

            # Retrieval: the expected section title is mentioned in the answer.
            if case["expected_section"] in answer_text:
                results["retrieval_accuracy"] += 1

            # Page: match the number as a whole, so page 3 is not credited
            # when the answer only mentions page 13.
            page_pattern = rf"(?<!\d){re.escape(str(case['expected_page']))}(?!\d)"
            if re.search(page_pattern, answer_text):
                results["page_accuracy"] += 1

        metric_keys = [key for key in results if key != "overall_score"]
        for key in metric_keys:
            results[key] /= n

        results["overall_score"] = sum(results[k] for k in metric_keys) / len(metric_keys)

        return results

    @staticmethod
    def _answer_text(response) -> str:
        """Return the answer text from an agent response.

        A dict response keeps its text under "output"; using ``in`` on the
        dict itself would only test its keys.
        """
        if isinstance(response, dict):
            return str(response.get("output", ""))
        return str(response)

# Usage
# Each test case supplies the three keys evaluate() reads: "question",
# "expected_section" and "expected_page".
evaluator = RAGEvaluator()
test_cases = [
    {"question": "What is the contract duration?", "expected_section": "Contract Duration", "expected_page": 3},
    {"question": "What is the annual growth rate?", "expected_section": "Financial Data", "expected_page": 12},
]
# results = evaluator.evaluate(test_cases, agent)

6. Technology Selection Summary

| Layer | Recommended Tools | Alternatives |
|---|---|---|
| Native text parsing | PyMuPDF | pdfplumber, PyPDF2 |
| OCR | Tesseract + pytesseract | PaddleOCR, EasyOCR |
| Multimodal parsing | GPT-4o / Claude | Gemini, local models |
| Table extraction | Camelot | Tabula, pdfplumber |
| Vector database | Chroma / Pinecone | Weaviate, Milvus |
| Embedding | text-embedding-3-small | Cohere, local models |
| Agent framework | LangChain | LlamaIndex, custom |

7. Summary

| Layer | Core | Key Output |
|---|---|---|
| Parsing Layer | Classify first, then parse | Text + images + tables |
| Structure Restoration | Preserve native structure | Sections + tables + figures |
| Chunking & Indexing | Chunk by semantic structure, with metadata | High-quality chunks + vector index |
| Agent Invocation | Encapsulate toolchain, invoke on demand | Traceable answers |

One-sentence summary: Production-grade PDF RAG is not "convert PDF to text then vectorize" — it's a complete engineering project with four layers: parsing, restoration, chunking, and invocation, each requiring careful design.


📌 Related Reading: