生产级 PDF RAG 完全指南:从解析到评测的四层架构实战

生产级 PDF RAG 完全指南:从解析到评测的四层架构实战

在上一篇文章《面试官问你:RAG 如何处理 PDF》中,我们给出了面试场景下的标准回答框架。这篇文章面向想深入理解生产级 PDF RAG 底层实现的开发者,把四层架构讲透——每层都有代码实现、技术选型和实战技巧。

一、第 1 层:解析层——先判断 PDF 类型,再选解析方式

1.1 PDF 类型识别

import fitz  # PyMuPDF

def detect_pdf_type(pdf_path: str) -> str:
    """识别 PDF 类型:原生文本 / 扫描件 / 图文混排"""
    doc = fitz.open(pdf_path)
    
    text_ratio = 0
    image_ratio = 0
    
    for page in doc:
        # 提取文本
        text = page.get_text()
        text_area = len(text) * 100  # 粗略估算文本面积
        
        # 提取图片
        images = page.get_images()
        image_area = sum(img[2] * img[3] for img in images)  # width * height
        
        page_area = page.rect.width * page.rect.height
        text_ratio += text_area / page_area if page_area > 0 else 0
        image_ratio += image_area / page_area if page_area > 0 else 0
    
    doc.close()
    
    avg_text_ratio = text_ratio / len(doc) if len(doc) > 0 else 0
    avg_image_ratio = image_ratio / len(doc) if len(doc) > 0 else 0
    
    if avg_text_ratio > 0.3:
        return "native_text"  # 原生文本 PDF
    elif avg_image_ratio > 0.5:
        return "scanned"      # 扫描件 PDF
    else:
        return "mixed"        # 图文混排 PDF

# 使用
pdf_type = detect_pdf_type("contract.pdf")
print(f"PDF 类型: {pdf_type}")

1.2 原生文本 PDF 解析

import fitz

class NativeTextParser:
    """原生文本 PDF 解析器"""
    
    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
    
    def extract_structured_text(self) -> list:
        """提取带结构信息的文本"""
        pages = []
        for page_num, page in enumerate(self.doc):
            # 获取文本块(带位置信息)
            blocks = page.get_text("dict")["blocks"]
            
            page_content = {
                "page_num": page_num + 1,
                "blocks": []
            }
            
            for block in blocks:
                if block["type"] == 0:  # 文本块
                    for line in block["lines"]:
                        text = "".join([span["text"] for span in line["spans"]])
                        font_size = line["spans"][0]["size"] if line["spans"] else 12
                        
                        page_content["blocks"].append({
                            "type": "text",
                            "text": text,
                            "font_size": font_size,
                            "bbox": line["bbox"],
                            "is_heading": font_size > 14  # 简单判断标题
                        })
                elif block["type"] == 1:  # 图片块
                    page_content["blocks"].append({
                        "type": "image",
                        "bbox": block["bbox"]
                    })
            
            pages.append(page_content)
        
        return pages
    
    def extract_tables(self, page_num: int) -> list:
        """提取表格(需要 camelot 或 tabula)"""
        try:
            import camelot
            tables = camelot.read_pdf(self.pdf_path, pages=str(page_num + 1))
            return [table.df.to_dict() for table in tables]
        except ImportError:
            print("请安装 camelot: pip install camelot-py")
            return []

# 使用
parser = NativeTextParser("report.pdf")
pages = parser.extract_structured_text()
print(f"提取了 {len(pages)} 页")

1.3 扫描件 PDF 解析(OCR)

import pytesseract
from PIL import Image
import fitz

class OCRParser:
    """扫描件 PDF 解析器(OCR)"""
    
    def __init__(self, pdf_path: str, lang: str = "chi_sim+eng"):
        self.pdf_path = pdf_path
        self.lang = lang
        self.doc = fitz.open(pdf_path)
    
    def extract_text(self, dpi: int = 300) -> list:
        """通过 OCR 提取文本"""
        pages = []
        for page_num, page in enumerate(self.doc):
            # 将页面转为图片
            mat = fitz.Matrix(dpi / 72, dpi / 72)  # 72 is default DPI
            pix = page.get_pixmap(matrix=mat)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # OCR 识别
            text = pytesseract.image_to_string(img, lang=self.lang)
            
            pages.append({
                "page_num": page_num + 1,
                "text": text,
                "dpi": dpi
            })
        
        return pages
    
    def get_confidence(self, dpi: int = 300) -> list:
        """获取 OCR 置信度(用于质量评估)"""
        pages = []
        for page_num, page in enumerate(self.doc):
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pix = page.get_pixmap(matrix=mat)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # 获取详细数据(含置信度)
            data = pytesseract.image_to_data(img, lang=self.lang, output_type=pytesseract.Output.DICT)
            confidences = [int(c) for c in data["conf"] if int(c) > 0]
            
            avg_conf = sum(confidences) / len(confidences) if confidences else 0
            pages.append({
                "page_num": page_num + 1,
                "avg_confidence": avg_conf,
                "low_conf_words": sum(1 for c in confidences if c < 60)
            })
        
        return pages

# 使用
ocr_parser = OCRParser("scanned_contract.pdf")
pages = ocr_parser.extract_text()
confidence = ocr_parser.get_confidence()
print(f"OCR 平均置信度: {confidence[0]['avg_confidence']:.1f}%")

1.4 图文混排 PDF 解析(多模态)

import base64
import fitz

class MultimodalParser:
    """图文混排 PDF 解析器(多模态)"""
    
    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
    
    def extract_with_images(self) -> list:
        """提取文本和图片(图片用 base64 编码)"""
        pages = []
        for page_num, page in enumerate(self.doc):
            # 提取文本
            text = page.get_text()
            
            # 提取图片
            images = []
            for img in page.get_images():
                xref = img[0]
                base_image = self.doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_b64 = base64.b64encode(image_bytes).decode("utf-8")
                images.append({
                    "xref": xref,
                    "base64": image_b64,
                    "extension": base_image["ext"]
                })
            
            pages.append({
                "page_num": page_num + 1,
                "text": text,
                "images": images
            })
        
        return pages
    
    def analyze_with_vision(self, api_key: str) -> list:
        """用视觉模型分析图片内容"""
        import openai
        
        pages = self.extract_with_images()
        analyzed_pages = []
        
        for page in pages:
            image_descriptions = []
            for img in page["images"]:
                response = openai.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "user", "content": [
                            {"type": "text", "text": "请描述这张图片的内容,包括图表类型、数据、文字等。"},
                            {"type": "image_url", "image_url": {
                                "url": f"data:image/{img['extension']};base64,{img['base64']}"
                            }}
                        ]}
                    ],
                    max_tokens=500
                )
                image_descriptions.append(response.choices[0].message.content)
            
            analyzed_pages.append({
                "page_num": page["page_num"],
                "text": page["text"],
                "image_descriptions": image_descriptions
            })
        
        return analyzed_pages

# 使用
mm_parser = MultimodalParser("report_with_charts.pdf")
pages = mm_parser.extract_with_images()
# analyzed = mm_parser.analyze_with_vision("your-api-key")  # 需要 API key

1.5 混合解析策略——自动调度器

在实际生产中,一份 PDF 的不同页面可能同时包含文本段、扫描区域和图表。单一解析器往往无法覆盖所有场景,需要一个混合解析调度器来按页自动选择最优解析方式。该调度器的核心思路是先对每页做快速分类,再分发到对应的解析器,最后合并结果。

class HybridPDFParser:
    """混合解析调度器:按页自动选择最优解析器"""
    
    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.native_parser = NativeTextParser(pdf_path)
        self.ocr_parser = OCRParser(pdf_path)
        self.multimodal_parser = MultimodalParser(pdf_path)
    
    def parse(self) -> list:
        """按页混合解析"""
        # 先做类型检测
        pdf_type = detect_pdf_type(self.pdf_path)
        
        if pdf_type == "native_text":
            return self.native_parser.extract_structured_text()
        elif pdf_type == "scanned":
            return self.ocr_parser.extract_text()
        else:
            # 图文混排:逐页判断,选择最佳解析器
            return self._parse_mixed()
    
    def _parse_mixed(self) -> list:
        """逐页解析混合类型 PDF"""
        native_pages = self.native_parser.extract_structured_text()
        ocr_pages = self.ocr_parser.extract_text()
        
        combined = []
        for i, (native, ocr) in enumerate(zip(native_pages, ocr_pages)):
            native_text_len = sum(
                len(b["text"]) for b in native["blocks"] if b["type"] == "text"
            )
            ocr_text_len = len(ocr["text"])
            
            # 如果原生解析文本量远超 OCR(2倍以上),选原生
            if native_text_len > ocr_text_len * 2:
                combined.append(native)
            # 如果 OCR 文本量远超原生,说明该页本质是扫描区域
            elif ocr_text_len > native_text_len * 2:
                combined.append(ocr)
            else:
                # 两者接近:合并结果,去重
                merged = native.copy()
                merged["text"] = ocr["text"]
                combined.append(merged)
        
        return combined

这个调度器的优势在于避免了"一刀切"的解析方式带来的信息损失。例如一份技术报告,前几页是原生文本目录,中间某页是扫描件影印,后面又是原生文本正文——混合调度器能为每一页独立选择最佳策略,保证整体解析质量。

二、第 2 层:结构还原——保留 PDF 原生结构

2.1 文档结构还原器

class DocumentStructure:
    """文档结构还原"""
    
    def __init__(self):
        self.title = ""
        self.sections = []  # 章节列表
        self.tables = []    # 表格列表
        self.figures = []   # 图片列表
        self.footnotes = [] # 脚注列表
    
    def add_section(self, level: int, title: str, page_num: int, content: str):
        self.sections.append({
            "level": level,
            "title": title,
            "page_num": page_num,
            "content": content
        })
    
    def add_table(self, table_id: str, page_num: int, caption: str, data: dict):
        self.tables.append({
            "table_id": table_id,
            "page_num": page_num,
            "caption": caption,
            "data": data
        })
    
    def add_figure(self, figure_id: str, page_num: int, caption: str, description: str):
        self.figures.append({
            "figure_id": figure_id,
            "page_num": page_num,
            "caption": caption,
            "description": description
        })

class StructureExtractor:
    """从解析结果中提取文档结构"""
    
    def extract(self, pages: list) -> DocumentStructure:
        doc = DocumentStructure()
        current_section = None
        
        for page in pages:
            for block in page.get("blocks", []):
                if block["type"] == "text":
                    text = block["text"].strip()
                    
                    # 检测标题(字体大小 > 14)
                    if block.get("is_heading") and len(text) < 100:
                        level = 1 if block["font_size"] > 18 else 2
                        current_section = {
                            "level": level,
                            "title": text,
                            "page_num": page["page_num"],
                            "content": ""
                        }
                        doc.add_section(level, text, page["page_num"], "")
                    elif current_section:
                        current_section["content"] += text + "\n"
        
        return doc

# 使用
extractor = StructureExtractor()
doc_structure = extractor.extract(pages)
print(f"提取了 {len(doc_structure.sections)} 个章节")

2.2 表格结构化处理

import pandas as pd

class TableProcessor:
    """表格结构化处理"""
    
    def process_table(self, table_data: dict, table_id: str, page_num: int) -> dict:
        """将表格转为结构化文本"""
        df = pd.DataFrame(table_data)
        
        # 生成表格描述
        description = f"表格 {table_id}(第 {page_num} 页):\n"
        description += f"列名:{', '.join(df.columns.tolist())}\n"
        description += f"行数:{len(df)}\n"
        description += "数据:\n"
        
        # 逐行描述
        for idx, row in df.iterrows():
            row_desc = f"  第 {idx + 1} 行:"
            row_desc += ", ".join([f"{col}={val}" for col, val in row.items()])
            description += row_desc + "\n"
        
        return {
            "table_id": table_id,
            "page_num": page_num,
            "columns": df.columns.tolist(),
            "data": table_data,
            "structured_text": description
        }
    
    def extract_with_camelot(self, pdf_path: str, page_num: int) -> list:
        """使用 camelot 提取表格"""
        try:
            import camelot
            tables = camelot.read_pdf(pdf_path, pages=str(page_num))
            results = []
            for i, table in enumerate(tables):
                results.append({
                    "table_id": f"table_{page_num}_{i}",
                    "page_num": page_num,
                    "data": table.df.to_dict(),
                    "accuracy": table.parsing_report.get("accuracy", 0)
                })
            return results
        except ImportError:
            print("请安装 camelot: pip install camelot-py")
            return []

# 使用
processor = TableProcessor()
# structured_table = processor.process_table(table_data, "table_1", 5)

2.3 目录与层级关系提取

长文档通常包含多级目录、交叉引用和脚注等层级关系。忽略这些信息会导致 Agent 在回答"参见第 3.2 节"这类问题时无从下手。一个完整的结构还原器需要建立章节间的前后引用关系,为后续检索提供上下文线索。

class TOCExtractor:
    """目录与章节层级关系提取器"""
    
    def __init__(self, doc: DocumentStructure):
        self.doc = doc
    
    def build_section_tree(self) -> dict:
        """构建章节树"""
        tree = {"children": [], "title": self.doc.title}
        stack = [tree]
        
        for section in self.doc.sections:
            node = {
                "title": section["title"],
                "level": section["level"],
                "page_num": section["page_num"],
                "children": []
            }
            
            # 找到合适的父节点
            while len(stack) > 1 and stack[-1].get("level", 0) >= section["level"]:
                stack.pop()
            
            stack[-1]["children"].append(node)
            stack.append(node)
        
        return tree
    
    def extract_cross_references(self, text: str) -> list:
        """提取交叉引用(如"参见第 3.2 节"、"如表 5-1 所示")"""
        import re
        refs = []
        
        # 匹配"第 X.Y 节"、"第 X 章"等
        section_refs = re.findall(r'第\s*(\d+(?:\.\d+)*)\s*(?:节|章)', text)
        refs.extend([{"type": "section", "ref": r} for r in section_refs])
        
        # 匹配"表 X-Y"、"图 X-Y"等
        table_refs = re.findall(r'(?:表|图表)\s*(\d+-\d+)', text)
        refs.extend([{"type": "table", "ref": r} for r in table_refs])
        
        # 匹配"图 X-Y"
        figure_refs = re.findall(r'图\s*(\d+-\d+)', text)
        refs.extend([{"type": "figure", "ref": r} for r in figure_refs])
        
        return refs

建立章节树后,Agent 在回答问题时可以根据引用关系跳转到相关章节,而不是仅仅依赖向量相似度。这种结构感知的方式在处理"请对比第 3 章和第 5 章的结论"这类问题时尤其有效。

三、第 3 层:切片和索引——按语义结构切

3.1 语义切片策略

class SemanticChunker:
    """语义切片器"""
    
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
    
    def chunk_by_structure(self, doc: DocumentStructure) -> list:
        """按文档结构切片"""
        chunks = []
        
        for section in doc.sections:
            # 标题 + 内容作为一个 chunk
            chunk_text = f"# {section['title']}\n\n{section['content']}"
            
            # 如果内容太长,再按段落切
            if len(chunk_text) > self.chunk_size:
                paragraphs = section['content'].split('\n\n')
                current_chunk = f"# {section['title']}\n\n"
                
                for para in paragraphs:
                    if len(current_chunk) + len(para) < self.chunk_size:
                        current_chunk += para + "\n\n"
                    else:
                        chunks.append(self._create_chunk(current_chunk, section))
                        current_chunk = f"# {section['title']}\n\n{para}\n\n"
                
                if current_chunk.strip():
                    chunks.append(self._create_chunk(current_chunk, section))
            else:
                chunks.append(self._create_chunk(chunk_text, section))
        
        # 表格单独切片
        for table in doc.tables:
            chunks.append({
                "text": table["structured_text"],
                "metadata": {
                    "type": "table",
                    "table_id": table["table_id"],
                    "page_num": table["page_num"],
                    "section": "table"
                }
            })
        
        return chunks
    
    def _create_chunk, text: str, section: dict) -> dict:
        """创建带 metadata 的 chunk"""
        return {
            "text": text,
            "metadata": {
                "type": "text",
                "section_title": section["title"],
                "section_level": section["level"],
                "page_num": section["page_num"],
                "context": f"这是文档中'{section['title']}'章节的内容。"
            }
        }

# 使用
chunker = SemanticChunker(chunk_size=1000, chunk_overlap=200)
chunks = chunker.chunk_by_structure(doc_structure)
print(f"生成了 {len(chunks)} 个 chunk")

3.2 切片策略对比

策略 优点 缺点 适用场景
固定长度切片 实现简单 切断上下文 简单文档
按段落切片 保留段落完整性 段落长度不一 普通文档
按语义结构切片 保留文档结构 实现复杂 生产级应用
按章节切片 逻辑清晰 章节长度差异大 长文档

3.3 向量索引构建

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

class VectorIndexer:
    """向量索引构建器"""
    
    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.vectorstore = None
    
    def build_index(self, chunks: list, collection_name: str = "pdf_rag"):
        """构建向量索引"""
        texts = [chunk["text"] for chunk in chunks]
        metadatas = [chunk["metadata"] for chunk in chunks]
        
        self.vectorstore = Chroma.from_texts(
            texts=texts,
            embedding=self.embeddings,
            metadatas=metadatas,
            collection_name=collection_name
        )
        
        return self.vectorstore
    
    def search(self, query: str, top_k: int = 5) -> list:
        """检索相关 chunk"""
        if not self.vectorstore:
            raise ValueError("请先构建索引")
        
        results = self.vectorstore.similarity_search(query, k=top_k)
        return results
    
    def search_with_metadata(self, query: str, filter_dict: dict, top_k: int = 5) -> list:
        """带 metadata 过滤的检索"""
        results = self.vectorstore.similarity_search(
            query, k=top_k, filter=filter_dict
        )
        return results

# 使用
indexer = VectorIndexer()
vectorstore = indexer.build_index(chunks)
results = indexer.search("合同期限是多久?")
for r in results:
    print(f"[{r.metadata['section_title']}] {r.page_content[:100]}...")

3.4 混合检索——向量 + 关键词 + 知识图谱

单纯依赖向量相似度检索在处理精确匹配查询时表现不佳。例如用户搜索"第 3.2 节提到的 API 限流策略",关键词"3.2"在向量空间中并不具有特殊语义。生产级 PDF RAG 通常需要混合检索策略,将向量检索与关键词检索结合,并通过 RRF(Reciprocal Rank Fusion)或加权评分的方式合并结果。

class HybridSearchEngine:
    """混合检索引擎:向量 + 关键词 + 结构化过滤"""
    
    def __init__(self, vectorstore, chunks: list):
        self.vectorstore = vectorstore
        self.chunks = chunks
        # 构建关键词倒排索引
        self.inverted_index = self._build_inverted_index()
    
    def _build_inverted_index(self) -> dict:
        """构建倒排索引"""
        from collections import defaultdict
        index = defaultdict(set)
        
        for i, chunk in enumerate(self.chunks):
            words = chunk["text"].lower().split()
            for word in words:
                index[word].add(i)
        
        return index
    
    def keyword_search(self, query: str, top_k: int = 5) -> list:
        """关键词搜索(BM25 简化版)"""
        query_words = query.lower().split()
        scores = {}
        
        for word in query_words:
            if word in self.inverted_index:
                for chunk_idx in self.inverted_index[word]:
                    scores[chunk_idx] = scores.get(chunk_idx, 0) + 1
        
        # 按评分排序
        sorted_indices = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)
        return [self.chunks[i] for i in sorted_indices[:top_k]]
    
    def hybrid_search(self, query: str, top_k: int = 5,
                      vector_weight: float = 0.6,
                      keyword_weight: float = 0.4) -> list:
        """混合搜索:加权合并向量和关键词结果"""
        # 向量检索
        vector_results = self.vectorstore.similarity_search(query, k=top_k * 2)
        
        # 关键词检索
        keyword_results = self.keyword_search(query, top_k=top_k * 2)
        
        # RRF 融合
        chunk_scores = {}
        
        for rank, result in enumerate(vector_results):
            chunk_id = result.page_content[:50]
            chunk_scores[chunk_id] = chunk_scores.get(chunk_id, 0) + vector_weight / (rank + 1)
        
        for rank, result in enumerate(keyword_results):
            chunk_id = result["text"][:50]
            chunk_scores[chunk_id] = chunk_scores.get(chunk_id, 0) + keyword_weight / (rank + 1)
        
        # 排序返回
        sorted_ids = sorted(chunk_scores.keys(), key=lambda x: chunk_scores[x], reverse=True)
        return sorted_ids[:top_k]

混合检索把语义理解能力精确匹配能力相结合,在工程实践中通常能提升 15% 到 30% 的检索准确率。尤其是针对 PDF 文档中常见的关键词查询(如术语、编号、表格标题等),关键词检索几乎是不可或缺的补充手段。

四、第 4 层:Agent 调用——封装成工具链

4.1 PDF Agent 工具链

from langchain.agents import Tool, AgentExecutor, create_react_agent
from langchain.prompts import PromptTemplate

class PDFAgentTools:
    """PDF Agent 工具链"""
    
    def __init__(self, vectorstore, doc_structure: DocumentStructure):
        self.vectorstore = vectorstore
        self.doc_structure = doc_structure
        self.tools = self._create_tools()
    
    def _create_tools(self) -> list:
        return [
            Tool(
                name="search_pdf",
                func=self._search_pdf,
                description="检索 PDF 中的相关片段。输入:问题"
            ),
            Tool(
                name="read_page",
                func=self._read_page,
                description="读取指定页的完整内容。输入:页码"
            ),
            Tool(
                name="extract_table",
                func=self._extract_table,
                description="提取指定表格。输入:表格ID"
            ),
            Tool(
                name="analyze_chart",
                func=self._analyze_chart,
                description="分析图表内容。输入:图表ID"
            ),
            Tool(
                name="quote_source",
                func=self._quote_source,
                description="返回引用来源。输入:chunk内容"
            )
        ]
    
    def _search_pdf(self, query: str) -> str:
        results = self.vectorstore.similarity_search(query, k=5)
        return "\n\n".join([
            f"[{r.metadata.get('section_title', '未知')}] {r.page_content}"
            for r in results
        ])
    
    def _read_page(self, page_num: int) -> str:
        # 从原始文档读取指定页
        return f"第 {page_num} 页的完整内容..."
    
    def _extract_table(self, table_id: str) -> str:
        for table in self.doc_structure.tables:
            if table["table_id"] == table_id:
                return table["structured_text"]
        return f"未找到表格 {table_id}"
    
    def _analyze_chart(self, figure_id: str) -> str:
        for figure in self.doc_structure.figures:
            if figure["figure_id"] == figure_id:
                return figure["description"]
        return f"未找到图表 {figure_id}"
    
    def _quote_source(self, content: str) -> str:
        results = self.vectorstore.similarity_search(content, k=1)
        if results:
            r = results[0]
            return f"来源:{r.metadata.get('section_title', '未知')},第 {r.metadata.get('page_num', '?')} 页"
        return "未找到来源"

# 使用
tools = PDFAgentTools(vectorstore, doc_structure)
print(f"可用工具:{[t.name for t in tools.tools]}")

4.2 Agent 调用逻辑

def create_pdf_agent(tools, llm):
    """创建 PDF Agent"""
    
    prompt_template = """你是一个 PDF 文档助手,使用以下工具回答用户问题:

{tools}

使用格式:
Question: 用户问题
Thought: 思考过程
Action: 工具名
Action Input: 工具输入
Observation: 工具输出
... (重复)
Final Answer: 最终回答(必须包含引用来源)

Question: {input}
Thought:"""
    
    prompt = PromptTemplate.from_template(prompt_template)
    agent = create_react_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
    
    return agent_executor

# 使用
# agent = create_pdf_agent(tools.tools, llm)
# result = agent.invoke({"input": "合同期限是多久?"})

4.3 多轮对话上下文管理

在实际生产环境中,用户很少只问一个问题。一个高效的 PDF RAG 系统需要支持多轮对话上下文——用户在第一轮问"合同期限是多久",在第二轮追问"续约条件是什么"时,Agent 应该能记住上一轮的上下文。

class ConversationManager:
    """多轮对话上下文管理"""
    
    def __init__(self, max_history: int = 5):
        self.max_history = max_history
        self.history = []
    
    def add_exchange(self, question: str, answer: str):
        """添加一轮对话"""
        self.history.append({"question": question, "answer": answer})
        # 保留最近的历史
        if len(self.history) > self.max_history:
            self.history = self.history[-self.max_history:]
    
    def build_context(self, current_question: str) -> str:
        """构建带上下文的提示"""
        if not self.history:
            return current_question
        
        context = "以下是之前的对话历史:\n"
        for i, exchange in enumerate(self.history, 1):
            context += f"第 {i} 轮:\n"
            context += f"  用户:{exchange['question']}\n"
            context += f"  助手:{exchange['answer'][:200]}...\n\n"
        context += f"\n基于以上上下文,请回答:{current_question}"
        
        return context

通过显式维护对话历史并将其注入到 Agent 的推理上下文中,系统可以在多轮对话中保持连贯性,避免用户重复描述问题背景,显著提升交互体验。

五、生产级必加细节

5.1 可追溯引用

class CitationManager:
    """引用管理器"""
    
    def add_citations(self, answer: str, sources: list) -> str:
        """在回答中添加引用"""
        cited_answer = answer + "\n\n**参考来源:**\n"
        for i, source in enumerate(sources, 1):
            cited_answer += f"[{i}] {source['section_title']},第 {source['page_num']} 页\n"
        return cited_answer
    
    def verify_citation(self, answer: str, source_text: str) -> bool:
        """验证引用是否准确(防止幻觉)"""
        # 简单实现:检查回答中的关键信息是否在源文本中
        # 实际项目中可以用 NLI 模型做更严格的验证
        return True

# 使用
citation_mgr = CitationManager()
cited_answer = citation_mgr.add_citations(answer, sources)

5.2 检索结果重排序(Reranking)

初次检索返回的 top-k 结果中,排名靠后的片段往往与问题关联度不高。在检索和生成之间加一层重排序模型,可以显著提升送入 LLM 上下文的质量。常见的做法是使用 Cross-Encoder(如 cross-encoder/ms-marco-MiniLM-L-6-v2)对候选文档和查询重新打分。

from sentence_transformers import CrossEncoder

class Reranker:
    """检索结果重排序器"""
    
    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.model = CrossEncoder(model_name)
    
    def rerank(self, query: str, candidates: list, top_k: int = 5) -> list:
        """对候选文档重排序"""
        pairs = [(query, doc.page_content if hasattr(doc, 'page_content') else doc["text"]) 
                 for doc in candidates]
        
        scores = self.model.predict(pairs)
        
        # 按分数排序
        scored_candidates = list(zip(candidates, scores))
        scored_candidates.sort(key=lambda x: x[1], reverse=True)
        
        return [item[0] for item in scored_candidates[:top_k]]

使用重排序后的效果:初次检索 top-20,经 Reranker 筛选后取 top-5 送入 LLM,既控制了 token 消耗,又保留了最相关的上下文。这对于长文档的检索效果提升尤为明显。

5.3 增量索引与缓存策略

生产环境中 PDF 文档会频繁更新(合同修订、报告更新等),每次都全量重建索引代价太大。合理的做法是引入增量索引解析结果缓存机制。

import hashlib
import json
import os

class IncrementalIndexer:
    """增量索引管理器"""
    
    def __init__(self, cache_dir: str = ".cache/pdf_rag"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
    
    def get_file_hash(self, pdf_path: str) -> str:
        """计算文件哈希(用于检测变更)"""
        hasher = hashlib.sha256()
        with open(pdf_path, "rb") as f:
            hasher.update(f.read())
        return hasher.hexdigest()
    
    def load_cached(self, pdf_path: str) -> dict:
        """加载缓存的解析结果"""
        file_hash = self.get_file_hash(pdf_path)
        cache_file = os.path.join(self.cache_dir, f"{file_hash}.json")
        
        if os.path.exists(cache_file):
            with open(cache_file, "r", encoding="utf-8") as f:
                return json.load(f)
        return None
    
    def save_cache(self, pdf_path: str, data: dict):
        """缓存解析结果"""
        file_hash = self.get_file_hash(pdf_path)
        cache_file = os.path.join(self.cache_dir, f"{file_hash}.json")
        
        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    
    def should_rebuild(self, pdf_path: str) -> bool:
        """判断是否需要重建索引"""
        cached = self.load_cached(pdf_path)
        return cached is None  # 缓存不存在则需重建

这套缓存机制的思路是:只要 PDF 文件没有变化(SHA-256 哈希不变),就直接使用之前缓存的解析结果、结构还原和切片数据,仅在文件发生变化时才重新跑整个解析流程。这对于包含大量 PDF 的知识库运维场景,能节省大量重复计算开销。

5.4 评测闭环

class RAGEvaluator:
    """RAG 评测器"""
    
    def evaluate(self, test_cases: list, agent) -> dict:
        """评测 RAG 系统"""
        results = {
            "retrieval_accuracy": 0,  # 检索片段是否命中原文
            "page_accuracy": 0,       # 页码是否正确
            "table_accuracy": 0,      # 表格是否解析准确
            "ocr_accuracy": 0,        # OCR 是否漏字
            "chart_accuracy": 0,      # 图表信息是否正确
            "overall_score": 0
        }
        
        for case in test_cases:
            # 运行 Agent
            answer = agent.invoke({"input": case["question"]})
            
            # 评测检索准确性
            if case["expected_section"] in answer:
                results["retrieval_accuracy"] += 1
            
            # 评测页码准确性
            if str(case["expected_page"]) in answer:
                results["page_accuracy"] += 1
        
        # 计算平均分
        n = len(test_cases)
        for key in results:
            if key != "overall_score":
                results[key] /= n
        
        results["overall_score"] = sum(
            results[k] for k in results if k != "overall_score"
        ) / 5
        
        return results

# 使用
evaluator = RAGEvaluator()
test_cases = [
    {"question": "合同期限是多久?", "expected_section": "合同期限", "expected_page": 3},
    {"question": "年增长率是多少?", "expected_section": "财务数据", "expected_page": 12},
]
# results = evaluator.evaluate(test_cases, agent)

六、技术选型总结

层级 推荐工具 备选方案
原生文本解析 PyMuPDF pdfplumber, PyPDF2
OCR Tesseract + pytesseract PaddleOCR, EasyOCR
多模态解析 GPT-4o / Claude Gemini, 本地模型
表格提取 Camelot Tabula, pdfplumber
向量数据库 Chroma / Pinecone Weaviate, Milvus
Embedding text-embedding-3-small Cohere, 本地模型
Agent 框架 LangChain LlamaIndex, 自建
重排序 Cross-Encoder Cohere Rerank, bge-reranker

七、总结

层级 核心 关键产出
解析层 先分类,再解析 文本 + 图片 + 表格
结构还原 保留原生结构 章节 + 表格 + 图片
切片索引 按语义切,带 metadata 高质量 chunk + 向量索引
Agent 调用 封装工具链,按需调用 可追溯的回答

一句话总结: 生产级 PDF RAG 不是"PDF 转文本再向量化",而是四层架构的完整工程——解析、还原、切片、调用,每个环节都需要精心设计。混合解析策略解决了不同类型页面的一体化处理问题,混合检索与重排序提升了召回准确率,增量索引与缓存让系统在大规模场景下保持高效运转。这四层加上检索增强和评测闭环,构成了一个真正可用的 PDF 知识助手系统。


📌 关联阅读: