生产级 PDF RAG 完全指南:从解析到评测的四层架构实战
在上一篇文章《面试官问你:RAG 如何处理 PDF》中,我们给出了面试场景下的标准回答框架。这篇文章面向想深入理解生产级 PDF RAG 底层实现的开发者,把四层架构讲透——每层都有代码实现、技术选型和实战技巧。
一、第 1 层:解析层——先判断 PDF 类型,再选解析方式
1.1 PDF 类型识别
import fitz # PyMuPDF
def detect_pdf_type(pdf_path: str) -> str:
    """Classify a PDF as native text, scanned, or mixed content.

    For every page a crude text-coverage ratio (100 area units per extracted
    character) and an image-coverage ratio (pixel width * height of every
    image listed for the page) are computed relative to the page area. The
    per-page ratios are averaged over the document and compared against
    fixed thresholds.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        ``"native_text"`` when the average text ratio exceeds 0.3,
        ``"scanned"`` when the average image ratio exceeds 0.5,
        otherwise ``"mixed"`` (also returned for a document without pages).
    """
    doc = fitz.open(pdf_path)
    try:
        # The page count must be read while the document is still open;
        # asking a closed document for its length fails.
        page_count = len(doc)
        text_ratio = 0.0
        image_ratio = 0.0
        for page in doc:
            page_area = page.rect.width * page.rect.height
            if page_area <= 0:
                # A degenerate page contributes nothing to either ratio.
                continue
            # Rough estimate: every extracted character covers 100 units.
            text_area = len(page.get_text()) * 100
            # Entries 2 and 3 of each image tuple are its width and height.
            image_area = sum(img[2] * img[3] for img in page.get_images())
            text_ratio += text_area / page_area
            image_ratio += image_area / page_area
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()

    if page_count == 0:
        return "mixed"

    avg_text_ratio = text_ratio / page_count
    avg_image_ratio = image_ratio / page_count
    if avg_text_ratio > 0.3:
        return "native_text"  # native text PDF
    if avg_image_ratio > 0.5:
        return "scanned"  # scanned PDF
    return "mixed"  # mixed text-and-image PDF
# Usage: classify a sample file and print the detected type.
pdf_type = detect_pdf_type("contract.pdf")
print(f"PDF 类型: {pdf_type}")
1.2 原生文本 PDF 解析
import fitz
class NativeTextParser:
    """Parser for PDFs that carry a native (selectable) text layer.

    The document is opened in the constructor and stays open until
    :meth:`close` is called or the surrounding ``with`` block ends.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Release the underlying document handle."""
        self.doc.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def extract_structured_text(self) -> list:
        """Extract text lines and image placeholders with layout information.

        Returns:
            One dict per page with ``page_num`` (1-based) and ``blocks``.
            A text entry holds ``type="text"``, the concatenated span text of
            one line, the font size of the line's first span (12 when the
            line has no spans), the line ``bbox`` and ``is_heading`` (font
            size above 14). An image entry holds ``type="image"`` and
            ``bbox``.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            entries = []
            # The "dict" layout keeps position and font data for every line.
            for block in page.get_text("dict")["blocks"]:
                if block["type"] == 0:  # text block
                    for line in block["lines"]:
                        spans = line["spans"]
                        font_size = spans[0]["size"] if spans else 12
                        entries.append({
                            "type": "text",
                            "text": "".join(span["text"] for span in spans),
                            "font_size": font_size,
                            "bbox": line["bbox"],
                            # Simple heuristic: large fonts are headings.
                            "is_heading": font_size > 14,
                        })
                elif block["type"] == 1:  # image block
                    entries.append({"type": "image", "bbox": block["bbox"]})
            pages.append({"page_num": page_num + 1, "blocks": entries})
        return pages

    def extract_tables(self, page_num: int) -> list:
        """Extract the tables of one page with camelot.

        Args:
            page_num: Zero-based page index; it is converted to the 1-based
                page string passed to camelot.

        Returns:
            One ``DataFrame.to_dict()`` result per detected table, or an
            empty list (after printing an install hint) when camelot is not
            installed.
        """
        # Keep the try body minimal so that only a missing camelot package
        # triggers the install hint.
        try:
            import camelot
        except ImportError:
            print("请安装 camelot: pip install camelot-py")
            return []
        tables = camelot.read_pdf(self.pdf_path, pages=str(page_num + 1))
        return [table.df.to_dict() for table in tables]
# Usage: parse a native-text PDF and report how many pages were extracted.
parser = NativeTextParser("report.pdf")
pages = parser.extract_structured_text()
print(f"提取了 {len(pages)} 页")
1.3 扫描件 PDF 解析(OCR)
import pytesseract
from PIL import Image
import fitz
class OCRParser:
    """OCR-based parser for scanned PDFs.

    Every page is rasterised and passed to Tesseract through pytesseract.
    The document stays open until :meth:`close` is called.
    """

    def __init__(self, pdf_path: str, lang: str = "chi_sim+eng"):
        self.pdf_path = pdf_path
        self.lang = lang
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Release the underlying document handle."""
        self.doc.close()

    def _render_page(self, page, dpi: int):
        """Rasterise one page at ``dpi`` and return it as an RGB PIL image."""
        scale = dpi / 72  # 72 is the default DPI
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    def extract_text(self, dpi: int = 300) -> list:
        """Run OCR on every page.

        Args:
            dpi: Rendering resolution of the page images.

        Returns:
            One dict per page with ``page_num`` (1-based), the recognised
            ``text`` and the ``dpi`` that was used.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            img = self._render_page(page, dpi)
            pages.append({
                "page_num": page_num + 1,
                "text": pytesseract.image_to_string(img, lang=self.lang),
                "dpi": dpi,
            })
        return pages

    @staticmethod
    def _summarize_confidences(raw_conf, low_conf_threshold: float = 60) -> dict:
        """Aggregate raw per-box confidences into page-level statistics.

        Values may arrive as ints, floats or numeric strings, so they are
        parsed with ``float``; ``int("96.5")`` would raise. Negative values
        are skipped (presumably the -1 marker for boxes without a recognised
        word - TODO confirm against the pytesseract version in use), while a
        genuine confidence of 0 is kept so that unreadable words lower the
        average instead of being ignored.
        """
        values = []
        for raw in raw_conf:
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue
            if value >= 0:
                values.append(value)
        return {
            "avg_confidence": sum(values) / len(values) if values else 0,
            "low_conf_words": sum(1 for v in values if v < low_conf_threshold),
        }

    def get_confidence(self, dpi: int = 300, low_conf_threshold: float = 60) -> list:
        """Collect OCR confidence statistics per page for quality checks.

        Args:
            dpi: Rendering resolution of the page images.
            low_conf_threshold: Confidence below which a word is counted in
                ``low_conf_words``.

        Returns:
            One dict per page with ``page_num`` (1-based), ``avg_confidence``
            (0 when nothing was recognised) and ``low_conf_words``.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            img = self._render_page(page, dpi)
            # Detailed output carries one confidence value per box.
            data = pytesseract.image_to_data(
                img, lang=self.lang, output_type=pytesseract.Output.DICT
            )
            stats = self._summarize_confidences(data["conf"], low_conf_threshold)
            pages.append({"page_num": page_num + 1, **stats})
        return pages
# Usage: OCR a scanned PDF and print the average confidence of its first page.
# NOTE(review): indexing confidence[0] assumes the document has at least one
# page; get_confidence() returns one entry per page.
ocr_parser = OCRParser("scanned_contract.pdf")
pages = ocr_parser.extract_text()
confidence = ocr_parser.get_confidence()
print(f"OCR 平均置信度: {confidence[0]['avg_confidence']:.1f}%")
1.4 图文混排 PDF 解析(多模态)
import base64
import fitz
class MultimodalParser:
    """Parser for PDFs that mix text with embedded images.

    The document stays open until :meth:`close` is called.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def close(self) -> None:
        """Release the underlying document handle."""
        self.doc.close()

    def extract_with_images(self) -> list:
        """Extract page text together with the images of each page.

        Returns:
            One dict per page with ``page_num`` (1-based), ``text`` and
            ``images``; each image carries its ``xref``, the image bytes as a
            base64 string and the file ``extension``.
        """
        pages = []
        for page_num, page in enumerate(self.doc):
            text = page.get_text()
            images = []
            for img in page.get_images():
                xref = img[0]
                base_image = self.doc.extract_image(xref)
                images.append({
                    "xref": xref,
                    "base64": base64.b64encode(base_image["image"]).decode("utf-8"),
                    "extension": base_image["ext"],
                })
            pages.append({
                "page_num": page_num + 1,
                "text": text,
                "images": images,
            })
        return pages

    def analyze_with_vision(self, api_key: str, model: str = "gpt-4o") -> list:
        """Describe every extracted image with a vision-capable chat model.

        Args:
            api_key: OpenAI API key. It is passed to an explicit client so
                that the key given by the caller is the one actually used.
            model: Name of the chat model that receives the images.

        Returns:
            One dict per page with ``page_num``, ``text`` and
            ``image_descriptions`` (one description per image, in order).
        """
        import openai

        client = openai.OpenAI(api_key=api_key)
        analyzed_pages = []
        for page in self.extract_with_images():
            image_descriptions = []
            for img in page["images"]:
                data_url = f"data:image/{img['extension']};base64,{img['base64']}"
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "user", "content": [
                            {"type": "text", "text": "请描述这张图片的内容,包括图表类型、数据、文字等。"},
                            {"type": "image_url", "image_url": {"url": data_url}},
                        ]}
                    ],
                    max_tokens=500,
                )
                image_descriptions.append(response.choices[0].message.content)
            analyzed_pages.append({
                "page_num": page["page_num"],
                "text": page["text"],
                "image_descriptions": image_descriptions,
            })
        return analyzed_pages
# Usage: extract text and base64-encoded images from a PDF with charts.
mm_parser = MultimodalParser("report_with_charts.pdf")
pages = mm_parser.extract_with_images()
# analyzed = mm_parser.analyze_with_vision("your-api-key") # requires an API key
1.5 混合解析策略——自动调度器
在实际生产中,一份 PDF 的不同页面可能同时包含文本段、扫描区域和图表。单一解析器往往无法覆盖所有场景,需要一个混合解析调度器来按页自动选择最优解析方式。该调度器的核心思路是先对每页做快速分类,再分发到对应的解析器,最后合并结果。
class HybridPDFParser:
    """Dispatcher that picks a parsing strategy for a PDF.

    The document type is detected first; native-text and scanned documents
    go to a single parser, while mixed documents are compared page by page.
    """

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.native_parser = NativeTextParser(pdf_path)
        self.ocr_parser = OCRParser(pdf_path)
        self.multimodal_parser = MultimodalParser(pdf_path)

    def parse(self) -> list:
        """Parse the document with the strategy matching its detected type."""
        kind = detect_pdf_type(self.pdf_path)
        if kind == "native_text":
            return self.native_parser.extract_structured_text()
        if kind == "scanned":
            return self.ocr_parser.extract_text()
        # Mixed content: decide per page which result to keep.
        return self._parse_mixed()

    def _parse_mixed(self) -> list:
        """Pick, for every page, the native or the OCR result.

        A page keeps the native result when it holds more than twice as many
        characters as the OCR text, the OCR result in the opposite case, and
        otherwise a copy of the native page with the OCR text attached under
        ``"text"``.
        """
        structured_pages = self.native_parser.extract_structured_text()
        recognized_pages = self.ocr_parser.extract_text()

        selected = []
        for native_page, ocr_page in zip(structured_pages, recognized_pages):
            native_chars = 0
            for entry in native_page["blocks"]:
                if entry["type"] == "text":
                    native_chars += len(entry["text"])
            ocr_chars = len(ocr_page["text"])

            if native_chars > 2 * ocr_chars:
                choice = native_page
            elif ocr_chars > 2 * native_chars:
                # Far more OCR text means the page is essentially a scan.
                choice = ocr_page
            else:
                # Comparable amounts: keep the layout, attach the OCR text.
                choice = {**native_page, "text": ocr_page["text"]}
            selected.append(choice)
        return selected
这个调度器的优势在于避免了"一刀切"的解析方式带来的信息损失。例如一份技术报告,前几页是原生文本目录,中间某页是扫描件影印,后面又是原生文本正文——混合调度器能为每一页独立选择最佳策略,保证整体解析质量。
二、第 2 层:结构还原——保留 PDF 原生结构
2.1 文档结构还原器
class DocumentStructure:
    """In-memory container for the reconstructed structure of a document."""

    def __init__(self):
        self.title = ""
        self.sections = []   # section dicts, in reading order
        self.tables = []     # table dicts
        self.figures = []    # figure dicts
        self.footnotes = []  # footnotes

    def add_section(self, level: int, title: str, page_num: int, content: str):
        """Append a section with its heading level, title, page and content."""
        self.sections.append({
            "level": level,
            "title": title,
            "page_num": page_num,
            "content": content,
        })

    def add_table(self, table_id: str, page_num: int, caption: str, data: dict,
                  structured_text: str = ""):
        """Append a table.

        Args:
            table_id: Identifier used to look the table up later.
            page_num: Page the table was found on.
            caption: Table caption.
            data: Raw table data.
            structured_text: Optional textual rendering of the table. It is
                always stored (empty by default) because other code in this
                file reads ``table["structured_text"]``.
        """
        self.tables.append({
            "table_id": table_id,
            "page_num": page_num,
            "caption": caption,
            "data": data,
            "structured_text": structured_text,
        })

    def add_figure(self, figure_id: str, page_num: int, caption: str, description: str):
        """Append a figure with its caption and textual description."""
        self.figures.append({
            "figure_id": figure_id,
            "page_num": page_num,
            "caption": caption,
            "description": description,
        })
class StructureExtractor:
    """Build a ``DocumentStructure`` from parsed page dictionaries."""

    def extract(self, pages: list) -> "DocumentStructure":
        """Group text lines into sections delimited by heading lines.

        Args:
            pages: Page dicts with ``page_num`` and a ``blocks`` list whose
                text entries carry ``text``, ``font_size`` and optionally
                ``is_heading``. Pages without ``blocks`` contribute nothing.

        Returns:
            A ``DocumentStructure`` whose sections hold the text that follows
            each heading. Text that appears before the first heading is
            dropped.
        """
        doc = DocumentStructure()
        current_section = None
        for page in pages:
            for block in page.get("blocks", []):
                if block["type"] != "text":
                    continue
                text = block["text"].strip()
                # A heading is a non-empty, short line flagged by the parser.
                if block.get("is_heading") and 0 < len(text) < 100:
                    level = 1 if block["font_size"] > 18 else 2
                    doc.add_section(level, text, page["page_num"], "")
                    # Accumulate into the dict that the document stores;
                    # appending to a separate dict would leave every stored
                    # section with empty content.
                    current_section = doc.sections[-1]
                elif current_section is not None:
                    current_section["content"] += text + "\n"
        return doc
# Usage: rebuild the section structure from parsed pages.
# NOTE(review): extract() only reads the "blocks" list of each page; page
# dicts without that key (such as the {"page_num", "text", ...} dicts returned
# by OCRParser.extract_text) yield zero sections. Confirm that `pages` comes
# from NativeTextParser.extract_structured_text.
extractor = StructureExtractor()
doc_structure = extractor.extract(pages)
print(f"提取了 {len(doc_structure.sections)} 个章节")
2.2 表格结构化处理
import pandas as pd
class TableProcessor:
    """Turn extracted tables into structured, text-friendly records."""

    def process_table(self, table_data: dict, table_id: str, page_num: int) -> dict:
        """Render a table as a line-by-line textual description.

        Args:
            table_data: Column-oriented mapping accepted by ``pd.DataFrame``
                (for example the result of ``DataFrame.to_dict()``).
            table_id: Identifier included in the description.
            page_num: Page number included in the description.

        Returns:
            A dict with ``table_id``, ``page_num``, ``columns``, the original
            ``data`` and ``structured_text``.
        """
        df = pd.DataFrame(table_data)
        columns = df.columns.tolist()

        lines = [
            f"表格 {table_id}(第 {page_num} 页):",
            # Column labels are not necessarily strings (extracted tables
            # often use integer labels), so convert before joining.
            f"列名:{', '.join(str(col) for col in columns)}",
            f"行数:{len(df)}",
            "数据:",
        ]
        # Number rows by position: index labels may be strings or arbitrary
        # values, for which ``label + 1`` would fail.
        for row_number, (_, row) in enumerate(df.iterrows(), start=1):
            cells = ", ".join(f"{col}={val}" for col, val in row.items())
            lines.append(f" 第 {row_number} 行:{cells}")

        return {
            "table_id": table_id,
            "page_num": page_num,
            "columns": columns,
            "data": table_data,
            "structured_text": "\n".join(lines) + "\n",
        }

    def extract_with_camelot(self, pdf_path: str, page_num: int) -> list:
        """Extract the tables of one page with camelot.

        Args:
            pdf_path: Path of the PDF file.
            page_num: Page number, passed to camelot unchanged as the page
                string.

        Returns:
            One dict per table with ``table_id``, ``page_num``, ``data`` and
            the ``accuracy`` from camelot's parsing report (0 when absent);
            an empty list (after printing an install hint) when camelot is
            not installed.
        """
        try:
            import camelot
        except ImportError:
            print("请安装 camelot: pip install camelot-py")
            return []
        tables = camelot.read_pdf(pdf_path, pages=str(page_num))
        return [
            {
                "table_id": f"table_{page_num}_{i}",
                "page_num": page_num,
                "data": table.df.to_dict(),
                "accuracy": table.parsing_report.get("accuracy", 0),
            }
            for i, table in enumerate(tables)
        ]
# Usage: create the processor; the call below is left commented out because
# `table_data` is not defined in this snippet.
processor = TableProcessor()
# structured_table = processor.process_table(table_data, "table_1", 5)
2.3 目录与层级关系提取
长文档通常包含多级目录、交叉引用和脚注等层级关系。忽略这些信息会导致 Agent 在回答"参见第 3.2 节"这类问题时无从下手。一个完整的结构还原器需要建立章节间的前后引用关系,为后续检索提供上下文线索。
class TOCExtractor:
    """Derive the section hierarchy and cross references of a document."""

    def __init__(self, doc: DocumentStructure):
        self.doc = doc

    def build_section_tree(self) -> dict:
        """Nest the flat section list into a tree ordered by heading level.

        Returns:
            The root node (``children`` plus the document ``title``); every
            descendant carries ``title``, ``level``, ``page_num`` and
            ``children``.
        """
        root = {"children": [], "title": self.doc.title}
        ancestors = [root]
        for sec in self.doc.sections:
            depth = sec["level"]
            # Unwind to the nearest ancestor with a smaller level; the root
            # has no level and is never removed.
            while len(ancestors) > 1:
                if ancestors[-1].get("level", 0) < depth:
                    break
                ancestors.pop()
            entry = {
                "title": sec["title"],
                "level": depth,
                "page_num": sec["page_num"],
                "children": [],
            }
            ancestors[-1]["children"].append(entry)
            ancestors.append(entry)
        return root

    def extract_cross_references(self, text: str) -> list:
        """Find section, table and figure references in ``text``.

        Returns:
            Dicts with ``type`` (``"section"``, ``"table"`` or ``"figure"``)
            and ``ref`` (the captured number), grouped by type in that order.
        """
        import re

        patterns = (
            # Section or chapter numbers such as 3.2.
            ("section", r'第\s*(\d+(?:\.\d+)*)\s*(?:节|章)'),
            # Table numbers written as X-Y.
            ("table", r'(?:表|图表)\s*(\d+-\d+)'),
            # Figure numbers written as X-Y.
            ("figure", r'图\s*(\d+-\d+)'),
        )
        found = []
        for kind, pattern in patterns:
            for number in re.findall(pattern, text):
                found.append({"type": kind, "ref": number})
        return found
建立章节树后,Agent 在回答问题时可以根据引用关系跳转到相关章节,而不是仅仅依赖向量相似度。这种结构感知的方式在处理"请对比第 3 章和第 5 章的结论"这类问题时尤其有效。
三、第 3 层:切片和索引——按语义结构切
3.1 语义切片策略
class SemanticChunker:
    """Split a document into chunks that follow its section structure."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_by_structure(self, doc: "DocumentStructure") -> list:
        """Create one or more chunks per section and one chunk per table.

        A section whose heading plus content fits into ``chunk_size``
        becomes a single chunk. Longer sections are split at blank lines;
        every piece repeats the section heading and, when it fits into
        ``chunk_overlap``, starts with the trailing paragraphs of the
        previous piece.

        Args:
            doc: Object exposing ``sections`` and ``tables`` lists.

        Returns:
            Chunk dicts with ``text`` and ``metadata``.
        """
        chunks = []
        for section in doc.sections:
            header = f"# {section['title']}\n\n"
            whole = header + section["content"]
            if len(whole) <= self.chunk_size:
                chunks.append(self._create_chunk(whole, section))
                continue
            for body in self._split_paragraphs(section["content"], len(header)):
                chunks.append(self._create_chunk(header + body, section))

        # Tables always get their own chunk.
        for table in doc.tables:
            # Fall back to caption and raw data when no textual rendering of
            # the table has been stored.
            text = table.get("structured_text") or (
                f"{table.get('caption', '')}\n{table.get('data', '')}".strip()
            )
            chunks.append({
                "text": text,
                "metadata": {
                    "type": "table",
                    "table_id": table["table_id"],
                    "page_num": table["page_num"],
                    "section": "table",
                },
            })
        return chunks

    def _split_paragraphs(self, content: str, header_len: int) -> list:
        """Pack paragraphs into bodies that respect ``chunk_size``."""
        bodies = []
        current = []
        current_len = header_len
        for para in content.split("\n\n"):
            if not para.strip():
                continue
            # Flush only when something has been collected, so an oversized
            # first paragraph never produces a heading-only chunk.
            if current and current_len + len(para) >= self.chunk_size:
                bodies.append("".join(p + "\n\n" for p in current))
                current = self._overlap_tail(current, header_len + len(para))
                current_len = header_len + sum(len(p) + 2 for p in current)
            current.append(para)
            current_len += len(para) + 2
        if current:
            bodies.append("".join(p + "\n\n" for p in current))
        return bodies

    def _overlap_tail(self, paragraphs: list, reserved: int) -> list:
        """Return the trailing paragraphs to repeat in the next chunk.

        Paragraphs are taken from the end while they fit into
        ``chunk_overlap`` and still leave room (``reserved`` characters) for
        the heading and the paragraph that opens the next chunk.
        """
        tail = []
        used = 0
        for para in reversed(paragraphs):
            cost = len(para) + 2
            if used + cost > self.chunk_overlap:
                break
            if reserved + used + cost >= self.chunk_size:
                break
            tail.insert(0, para)
            used += cost
        return tail

    def _create_chunk(self, text: str, section: dict) -> dict:
        """Wrap ``text`` in a chunk dict carrying the section metadata."""
        return {
            "text": text,
            "metadata": {
                "type": "text",
                "section_title": section["title"],
                "section_level": section["level"],
                "page_num": section["page_num"],
                "context": f"这是文档中'{section['title']}'章节的内容。",
            },
        }
# Usage: chunk the reconstructed document and report the number of chunks.
chunker = SemanticChunker(chunk_size=1000, chunk_overlap=200)
chunks = chunker.chunk_by_structure(doc_structure)
print(f"生成了 {len(chunks)} 个 chunk")
3.2 切片策略对比
| 策略 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| 固定长度切片 | 实现简单 | 切断上下文 | 简单文档 |
| 按段落切片 | 保留段落完整性 | 段落长度不一 | 普通文档 |
| 按语义结构切片 | 保留文档结构 | 实现复杂 | 生产级应用 |
| 按章节切片 | 逻辑清晰 | 章节长度差异大 | 长文档 |
3.3 向量索引构建
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
class VectorIndexer:
    """Build and query a vector index over document chunks."""

    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.vectorstore = None

    def build_index(self, chunks: list, collection_name: str = "pdf_rag"):
        """Embed the chunks and store them in a Chroma collection.

        Args:
            chunks: Dicts with ``text`` and ``metadata``.
            collection_name: Name of the collection to create.

        Returns:
            The created vector store, also kept on ``self.vectorstore``.
        """
        self.vectorstore = Chroma.from_texts(
            texts=[chunk["text"] for chunk in chunks],
            embedding=self.embeddings,
            metadatas=[chunk["metadata"] for chunk in chunks],
            collection_name=collection_name,
        )
        return self.vectorstore

    def _require_index(self):
        """Return the vector store or raise when no index was built yet."""
        if self.vectorstore is None:
            raise ValueError("请先构建索引")
        return self.vectorstore

    def search(self, query: str, top_k: int = 5) -> list:
        """Return the ``top_k`` chunks most similar to ``query``.

        Raises:
            ValueError: If no index has been built.
        """
        return self._require_index().similarity_search(query, k=top_k)

    def search_with_metadata(self, query: str, filter_dict: dict, top_k: int = 5) -> list:
        """Similarity search restricted by a metadata filter.

        Raises:
            ValueError: If no index has been built (same contract as
                :meth:`search`, instead of failing on ``None``).
        """
        return self._require_index().similarity_search(
            query, k=top_k, filter=filter_dict
        )
# Usage: index the chunks and print the best matches for a sample question.
indexer = VectorIndexer()
vectorstore = indexer.build_index(chunks)
results = indexer.search("合同期限是多久?")
for r in results:
    # Table chunks carry no "section_title" in their metadata, so use a
    # lookup with a default instead of indexing the key directly.
    print(f"[{r.metadata.get('section_title', '未知')}] {r.page_content[:100]}...")
3.4 混合检索——向量 + 关键词
单纯依赖向量相似度检索在处理精确匹配查询时表现不佳。例如用户搜索"第 3.2 节提到的 API 限流策略",关键词"3.2"在向量空间中并不具有特殊语义。生产级 PDF RAG 通常需要混合检索策略,将向量检索与关键词检索结合,并通过 RRF(Reciprocal Rank Fusion)或加权评分的方式合并结果。
class HybridSearchEngine:
    """Hybrid retrieval that merges vector and keyword search results."""

    def __init__(self, vectorstore, chunks: list):
        self.vectorstore = vectorstore
        self.chunks = chunks
        # Keyword inverted index: token -> set of chunk positions.
        self.inverted_index = self._build_inverted_index()
        # Full chunk text -> chunk, used to map vector hits back to chunks.
        self._chunk_by_text = {}
        for chunk in chunks:
            self._chunk_by_text.setdefault(chunk["text"], chunk)

    @staticmethod
    def _tokenize(text: str) -> list:
        """Split text into lowercase search tokens.

        Whitespace-separated tokens are kept as they are. Chinese text has
        no word separators, so every run of CJK characters additionally
        contributes its overlapping two-character pieces (a single character
        for runs of length one).
        """
        tokens = text.lower().split()
        grams = []
        for token in tokens:
            run = []
            for ch in token + " ":  # the trailing sentinel flushes the last run
                if "\u4e00" <= ch <= "\u9fff":
                    run.append(ch)
                    continue
                if run:
                    if len(run) == 1:
                        pieces = run
                    else:
                        pieces = [a + b for a, b in zip(run, run[1:])]
                    # Do not count a piece twice when it is the token itself.
                    grams.extend(p for p in pieces if p != token)
                    run = []
        return tokens + grams

    def _build_inverted_index(self) -> dict:
        """Map every token to the positions of the chunks containing it."""
        from collections import defaultdict

        index = defaultdict(set)
        for i, chunk in enumerate(self.chunks):
            for token in self._tokenize(chunk["text"]):
                index[token].add(i)
        return index

    def keyword_search(self, query: str, top_k: int = 5) -> list:
        """Rank chunks by the number of query tokens they contain.

        Returns:
            Up to ``top_k`` chunk dicts, best match first; ties keep the
            original chunk order. Chunks without any matching token are
            omitted.
        """
        scores = {}
        for token in self._tokenize(query):
            for chunk_idx in self.inverted_index.get(token, ()):
                scores[chunk_idx] = scores.get(chunk_idx, 0) + 1
        ranked = sorted(scores, key=lambda idx: (-scores[idx], idx))
        return [self.chunks[i] for i in ranked[:top_k]]

    def hybrid_search(self, query: str, top_k: int = 5,
                      vector_weight: float = 0.6,
                      keyword_weight: float = 0.4) -> list:
        """Merge vector and keyword results by weighted reciprocal rank.

        Every result contributes ``weight / (rank + 1)`` to the score of its
        chunk, with ranks starting at 0.

        Returns:
            Up to ``top_k`` chunk dicts (``text`` and ``metadata``), best
            first. Chunks are returned rather than truncated text prefixes,
            and the full text is the merge key so that chunks sharing a
            common prefix are not conflated.
        """
        vector_results = self.vectorstore.similarity_search(query, k=top_k * 2)
        keyword_results = self.keyword_search(query, top_k=top_k * 2)

        scores = {}
        payloads = {}
        for rank, result in enumerate(vector_results):
            text = result.page_content
            scores[text] = scores.get(text, 0) + vector_weight / (rank + 1)
            if text not in payloads:
                known = self._chunk_by_text.get(text)
                if known is None:
                    # The hit is not one of the chunks given to the engine;
                    # wrap it in the same dict shape.
                    known = {
                        "text": text,
                        "metadata": dict(getattr(result, "metadata", None) or {}),
                    }
                payloads[text] = known
        for rank, chunk in enumerate(keyword_results):
            text = chunk["text"]
            scores[text] = scores.get(text, 0) + keyword_weight / (rank + 1)
            payloads.setdefault(text, chunk)

        ranked = sorted(scores, key=lambda key: scores[key], reverse=True)
        return [payloads[key] for key in ranked[:top_k]]
混合检索把语义理解能力和精确匹配能力相结合,在工程实践中通常能提升 15% 到 30% 的检索准确率。尤其是针对 PDF 文档中常见的关键词查询(如术语、编号、表格标题等),关键词检索几乎是不可或缺的补充手段。
四、第 4 层:Agent 调用——封装成工具链
4.1 PDF Agent 工具链
from langchain.agents import Tool, AgentExecutor, create_react_agent
from langchain.prompts import PromptTemplate
class PDFAgentTools:
    """Tool set that exposes an indexed PDF to an agent."""

    def __init__(self, vectorstore, doc_structure: "DocumentStructure"):
        self.vectorstore = vectorstore
        self.doc_structure = doc_structure
        self.tools = self._create_tools()

    def _create_tools(self) -> list:
        """Wrap the private helpers as agent tools."""
        return [
            Tool(
                name="search_pdf",
                func=self._search_pdf,
                description="检索 PDF 中的相关片段。输入:问题",
            ),
            Tool(
                name="read_page",
                func=self._read_page,
                description="读取指定页的完整内容。输入:页码",
            ),
            Tool(
                name="extract_table",
                func=self._extract_table,
                description="提取指定表格。输入:表格ID",
            ),
            Tool(
                name="analyze_chart",
                func=self._analyze_chart,
                description="分析图表内容。输入:图表ID",
            ),
            Tool(
                name="quote_source",
                func=self._quote_source,
                description="返回引用来源。输入:chunk内容",
            ),
        ]

    def _search_pdf(self, query: str) -> str:
        """Return the five most similar chunks, prefixed by section title."""
        results = self.vectorstore.similarity_search(query, k=5)
        return "\n\n".join(
            f"[{r.metadata.get('section_title', '未知')}] {r.page_content}"
            for r in results
        )

    def _read_page(self, page_num: int) -> str:
        """Return the sections recorded for one page.

        The content comes from ``doc_structure.sections`` and is matched on
        each section's recorded ``page_num``. The argument may be an int or
        a numeric string.
        """
        try:
            page = int(str(page_num).strip())
        except ValueError:
            # Report the problem instead of returning invented content.
            return f"无效的页码:{page_num}"
        parts = [
            f"# {section['title']}\n{section['content']}"
            for section in self.doc_structure.sections
            if section["page_num"] == page
        ]
        if not parts:
            return f"未找到第 {page} 页的内容"
        return "\n\n".join(parts)

    def _extract_table(self, table_id: str) -> str:
        """Return the textual rendering of the table with ``table_id``."""
        for table in self.doc_structure.tables:
            if table["table_id"] == table_id:
                # Tables may have been stored without a rendered text; fall
                # back to caption and raw data instead of raising KeyError.
                return table.get("structured_text") or (
                    f"{table.get('caption', '')}\n{table.get('data', '')}".strip()
                )
        return f"未找到表格 {table_id}"

    def _analyze_chart(self, figure_id: str) -> str:
        """Return the stored description of the figure with ``figure_id``."""
        for figure in self.doc_structure.figures:
            if figure["figure_id"] == figure_id:
                return figure["description"]
        return f"未找到图表 {figure_id}"

    def _quote_source(self, content: str) -> str:
        """Return section title and page of the chunk closest to ``content``."""
        results = self.vectorstore.similarity_search(content, k=1)
        if results:
            r = results[0]
            return f"来源:{r.metadata.get('section_title', '未知')},第 {r.metadata.get('page_num', '?')} 页"
        return "未找到来源"
# Usage: build the tool set and list the names of the available tools.
tools = PDFAgentTools(vectorstore, doc_structure)
print(f"可用工具:{[t.name for t in tools.tools]}")
4.2 Agent 调用逻辑
def create_pdf_agent(tools, llm):
    """Create a ReAct agent executor over the given PDF tools.

    Args:
        tools: Tool objects the agent may call.
        llm: Language model driving the agent.

    Returns:
        An ``AgentExecutor`` wrapping the agent and its tools.
    """
    # The prompt passed to create_react_agent has to expose the {tools},
    # {tool_names} and {agent_scratchpad} variables in addition to {input};
    # the scratchpad carries the intermediate Thought/Action/Observation
    # steps between iterations.
    prompt_template = """你是一个 PDF 文档助手,使用以下工具回答用户问题:
{tools}
使用格式:
Question: 用户问题
Thought: 思考过程
Action: 工具名,必须是 [{tool_names}] 之一
Action Input: 工具输入
Observation: 工具输出
... (重复)
Final Answer: 最终回答(必须包含引用来源)
Question: {input}
Thought:{agent_scratchpad}"""
    prompt = PromptTemplate.from_template(prompt_template)
    agent = create_react_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
    return agent_executor
# 使用
# agent = create_pdf_agent(tools.tools, llm)
# result = agent.invoke({"input": "合同期限是多久?"})
4.3 多轮对话上下文管理
在实际生产环境中,用户很少只问一个问题。一个高效的 PDF RAG 系统需要支持多轮对话上下文——用户在第一轮问"合同期限是多久",在第二轮追问"续约条件是什么"时,Agent 应该能记住上一轮的上下文。
class ConversationManager:
    """Keep a bounded history of question/answer exchanges."""

    # Number of answer characters included per exchange in the context.
    _ANSWER_PREVIEW_CHARS = 200

    def __init__(self, max_history: int = 5):
        self.max_history = max_history
        self.history = []

    def add_exchange(self, question: str, answer: str):
        """Record one exchange and drop the oldest ones beyond the limit."""
        self.history.append({"question": question, "answer": answer})
        # Count the surplus explicitly: slicing with ``[-max_history:]``
        # keeps the whole list when ``max_history`` is 0.
        overflow = len(self.history) - max(self.max_history, 0)
        if overflow > 0:
            del self.history[:overflow]

    def build_context(self, current_question: str) -> str:
        """Build a prompt that prefixes the question with the history.

        Returns:
            ``current_question`` unchanged when there is no history;
            otherwise the numbered exchanges (answers cut to 200 characters)
            followed by the question.
        """
        if not self.history:
            return current_question

        limit = self._ANSWER_PREVIEW_CHARS
        parts = ["以下是之前的对话历史:\n"]
        for i, exchange in enumerate(self.history, 1):
            answer = exchange["answer"]
            # Mark the answer with an ellipsis only when it was really cut.
            preview = answer[:limit] + "..." if len(answer) > limit else answer
            parts.append(f"第 {i} 轮:\n")
            parts.append(f" 用户:{exchange['question']}\n")
            parts.append(f" 助手:{preview}\n\n")
        parts.append(f"\n基于以上上下文,请回答:{current_question}")
        return "".join(parts)
通过显式维护对话历史并将其注入到 Agent 的推理上下文中,系统可以在多轮对话中保持连贯性,避免用户重复描述问题背景,显著提升交互体验。
五、生产级必加细节
5.1 可追溯引用
class CitationManager:
    """Attach source citations to answers and sanity-check them."""

    def add_citations(self, answer: str, sources: list) -> str:
        """Append a numbered source list to ``answer``.

        Args:
            answer: Generated answer text.
            sources: Dicts that may carry ``section_title`` and ``page_num``;
                missing values are rendered as placeholders.

        Returns:
            The answer followed by the reference list, or the answer
            unchanged when there are no sources.
        """
        if not sources:
            return answer
        lines = [answer + "\n\n**参考来源:**\n"]
        for i, source in enumerate(sources, 1):
            title = source.get("section_title", "未知")
            page = source.get("page_num", "?")
            lines.append(f"[{i}] {title},第 {page} 页\n")
        return "".join(lines)

    def verify_citation(self, answer: str, source_text: str) -> bool:
        """Heuristically check that an answer is backed by its source.

        Every number in the answer (citation markers such as ``[1]`` aside)
        must occur in ``source_text``. This is a cheap substring check; an
        NLI model is needed for strict verification.

        Returns:
            ``False`` when the source is empty or a number from the answer
            is missing from it, otherwise ``True``.
        """
        import re

        if not source_text or not source_text.strip():
            return False
        # Citation markers are bookkeeping, not claims about the source.
        claims = re.sub(r"\[\d+\]", "", answer)
        numbers = re.findall(r"\d+(?:\.\d+)?", claims)
        return all(number in source_text for number in numbers)
# Usage: append the source list to an answer.
# NOTE(review): `answer` and `sources` are not defined in the visible
# snippets; presumably they come from an earlier agent run - confirm.
citation_mgr = CitationManager()
cited_answer = citation_mgr.add_citations(answer, sources)
5.2 检索结果重排序(Reranking)
初次检索返回的 top-k 结果中,排名靠后的片段往往与问题关联度不高。在检索和生成之间加一层重排序模型,可以显著提升送入 LLM 上下文的质量。常见的做法是使用 Cross-Encoder(如 cross-encoder/ms-marco-MiniLM-L-6-v2)对候选文档和查询重新打分。
from sentence_transformers import CrossEncoder
class Reranker:
    """Re-rank retrieval candidates with a cross-encoder."""

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.model = CrossEncoder(model_name)

    @staticmethod
    def _text_of(doc) -> str:
        """Return the text of a candidate (document object or chunk dict)."""
        if hasattr(doc, "page_content"):
            return doc.page_content
        return doc["text"]

    def rerank(self, query: str, candidates: list, top_k: int = 5) -> list:
        """Score every candidate against ``query`` and keep the best ones.

        Args:
            query: User question.
            candidates: Objects with ``page_content`` or dicts with ``text``.
            top_k: Number of candidates to return.

        Returns:
            Up to ``top_k`` candidates, highest score first; candidates with
            equal scores keep their input order. An empty input returns an
            empty list without calling the model.
        """
        if not candidates:
            return []
        pairs = [(query, self._text_of(doc)) for doc in candidates]
        scores = self.model.predict(pairs)
        order = sorted(range(len(candidates)), key=lambda i: scores[i], reverse=True)
        return [candidates[i] for i in order[:top_k]]
使用重排序后的效果:初次检索 top-20,经 Reranker 筛选后取 top-5 送入 LLM,既控制了 token 消耗,又保留了最相关的上下文。这对于长文档的检索效果提升尤为明显。
5.3 增量索引与缓存策略
生产环境中 PDF 文档会频繁更新(合同修订、报告更新等),每次都全量重建索引代价太大。合理的做法是引入增量索引和解析结果缓存机制。
import hashlib
import json
import os
class IncrementalIndexer:
    """File-hash keyed cache for parsed PDF results."""

    # Read size used while hashing, so large files are never loaded whole.
    _HASH_CHUNK_BYTES = 1 << 20

    def __init__(self, cache_dir: str = ".cache/pdf_rag"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def get_file_hash(self, pdf_path: str) -> str:
        """Return the SHA-256 hex digest of the file, used to detect changes."""
        hasher = hashlib.sha256()
        with open(pdf_path, "rb") as f:
            for block in iter(lambda: f.read(self._HASH_CHUNK_BYTES), b""):
                hasher.update(block)
        return hasher.hexdigest()

    def _cache_path(self, pdf_path: str) -> str:
        """Return the cache file path for the current content of the PDF."""
        return os.path.join(self.cache_dir, f"{self.get_file_hash(pdf_path)}.json")

    def load_cached(self, pdf_path: str) -> dict:
        """Load the cached result for the current file content.

        Returns:
            The cached data, or ``None`` when no cache entry exists or the
            entry cannot be decoded (treated as a miss so that the caller
            rebuilds it).
        """
        cache_file = self._cache_path(pdf_path)
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return None
        except (json.JSONDecodeError, UnicodeDecodeError):
            return None

    def save_cache(self, pdf_path: str, data: dict):
        """Store ``data`` as the cached result for the current file content."""
        cache_file = self._cache_path(pdf_path)
        # Write to a temporary file and rename it, so an interrupted write
        # never leaves a truncated cache entry behind.
        tmp_file = f"{cache_file}.tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, cache_file)

    def should_rebuild(self, pdf_path: str) -> bool:
        """Return ``True`` when no usable cache entry exists for the file."""
        return self.load_cached(pdf_path) is None
这套缓存机制的思路是:只要 PDF 文件没有变化(SHA-256 哈希不变),就直接使用之前缓存的解析结果、结构还原和切片数据,仅在文件发生变化时才重新跑整个解析流程。这对于包含大量 PDF 的知识库运维场景,能节省大量重复计算开销。
5.4 评测闭环
class RAGEvaluator:
    """Score an agent against test cases with expected section and page."""

    # Metrics that evaluate() actually measures; the remaining keys are
    # reported as 0 placeholders.
    _MEASURED = ("retrieval_accuracy", "page_accuracy")

    @staticmethod
    def _answer_text(response) -> str:
        """Extract the answer text from an agent response.

        A mapping is read through its ``"output"`` entry (presumably the
        shape returned by an agent executor - confirm for the agent in use);
        anything else is converted with ``str``.
        """
        if isinstance(response, dict):
            return str(response.get("output", ""))
        return str(response)

    def evaluate(self, test_cases: list, agent) -> dict:
        """Run every test case through the agent and aggregate hit rates.

        Args:
            test_cases: Dicts with ``question``, ``expected_section`` and
                ``expected_page``.
            agent: Object whose ``invoke({"input": question})`` returns the
                answer.

        Returns:
            A dict with the fraction of answers containing the expected
            section name (``retrieval_accuracy``) and the expected page
            number (``page_accuracy``), zero placeholders for the metrics
            that are not measured, and ``overall_score`` as the mean of the
            measured metrics. All values are 0 for an empty test set.
        """
        results = {
            "retrieval_accuracy": 0,  # does the answer name the expected section
            "page_accuracy": 0,       # does the answer contain the expected page
            "table_accuracy": 0,      # not measured
            "ocr_accuracy": 0,        # not measured
            "chart_accuracy": 0,      # not measured
            "overall_score": 0,
        }
        n = len(test_cases)
        if n == 0:
            # Nothing to average; avoid dividing by zero.
            return results

        for case in test_cases:
            # Compare against the answer text; a membership test on a
            # response mapping would only look at its keys.
            answer = self._answer_text(agent.invoke({"input": case["question"]}))
            if case["expected_section"] in answer:
                results["retrieval_accuracy"] += 1
            if str(case["expected_page"]) in answer:
                results["page_accuracy"] += 1

        for key in self._MEASURED:
            results[key] /= n
        # Average only what was measured, so the placeholder metrics do not
        # drag the overall score down.
        results["overall_score"] = (
            sum(results[key] for key in self._MEASURED) / len(self._MEASURED)
        )
        return results
# Usage: define sample test cases; the evaluation call is left commented out
# because no `agent` object is created in the snippets above.
evaluator = RAGEvaluator()
test_cases = [
    {"question": "合同期限是多久?", "expected_section": "合同期限", "expected_page": 3},
    {"question": "年增长率是多少?", "expected_section": "财务数据", "expected_page": 12},
]
# results = evaluator.evaluate(test_cases, agent)
六、技术选型总结
| 层级 | 推荐工具 | 备选方案 |
|---|---|---|
| 原生文本解析 | PyMuPDF | pdfplumber, PyPDF2 |
| OCR | Tesseract + pytesseract | PaddleOCR, EasyOCR |
| 多模态解析 | GPT-4o / Claude | Gemini, 本地模型 |
| 表格提取 | Camelot | Tabula, pdfplumber |
| 向量数据库 | Chroma / Pinecone | Weaviate, Milvus |
| Embedding | text-embedding-3-small | Cohere, 本地模型 |
| Agent 框架 | LangChain | LlamaIndex, 自建 |
| 重排序 | Cross-Encoder | Cohere Rerank, bge-reranker |
七、总结
| 层级 | 核心 | 关键产出 |
|---|---|---|
| 解析层 | 先分类,再解析 | 文本 + 图片 + 表格 |
| 结构还原 | 保留原生结构 | 章节 + 表格 + 图片 |
| 切片索引 | 按语义切,带 metadata | 高质量 chunk + 向量索引 |
| Agent 调用 | 封装工具链,按需调用 | 可追溯的回答 |
一句话总结: 生产级 PDF RAG 不是"PDF 转文本再向量化",而是四层架构的完整工程——解析、还原、切片、调用,每个环节都需要精心设计。混合解析策略解决了不同类型页面的一体化处理问题,混合检索与重排序提升了召回准确率,增量索引与缓存让系统在大规模场景下保持高效运转。这四层加上检索增强和评测闭环,构成了一个真正可用的 PDF 知识助手系统。
📌 关联阅读:
- 面试回答:面试官问你:RAG 如何处理 PDF——别再说"转文本切片"了
- Prompt Engineering 系列:Prompt Engineering 进阶:从"写指令"到"设计可迭代的提示词系统"