From 46428b7936969994aab65ea7179095634f36bb68 Mon Sep 17 00:00:00 2001 From: liuguancen Date: Thu, 2 Apr 2026 16:18:59 +0800 Subject: [PATCH] =?UTF-8?q?[RAG]=20PDF=E9=98=85=E8=AF=BB=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E6=94=B9=E7=94=A8pdfplumber(=E6=96=87=E6=9C=AC+=E8=A1=A8?= =?UTF-8?q?=E6=A0=BC=E6=8F=90=E5=8F=96=EF=BC=8C=E5=B9=B2=E5=87=80HTML)?= =?UTF-8?q?=EF=BC=9B=E4=BF=AE=E5=A4=8Dck=5Fmysql=5Fconfig=E5=AF=BC?= =?UTF-8?q?=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chat_web_front/src/views/reading/index.vue | 7 -- .../server/knowledge_base/file_converter.py | 88 +++++++++++++++---- .../server/knowledge_base/kb_doc_api.py | 3 +- 3 files changed, 72 insertions(+), 26 deletions(-) diff --git a/chat_web_front/src/views/reading/index.vue b/chat_web_front/src/views/reading/index.vue index 37920b3..d4c62c7 100644 --- a/chat_web_front/src/views/reading/index.vue +++ b/chat_web_front/src/views/reading/index.vue @@ -907,13 +907,6 @@ onMounted(async () => { flex: 1; overflow: auto; position: relative; padding: 0; .view-md { padding: 20px; - // 覆盖 PyMuPDF get_text("html") 输出的固定宽度 - :deep(div) { max-width: 100% !important; } - :deep(div[style*="width:"]) { width: auto !important; max-width: 100% !important; } - :deep(.pdf-page) { max-width: 100% !important; } - :deep(.pdf-page > div) { width: auto !important; max-width: 100% !important; } - :deep(.pdf-preview) { max-width: 100% !important; } - :deep(section) { max-width: 100% !important; } :deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; } :deep(.highlight) { background: #D0EAC8; } :deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; } diff --git a/langchain-chat/server/knowledge_base/file_converter.py b/langchain-chat/server/knowledge_base/file_converter.py index b75d5bb..0389d0b 100644 --- a/langchain-chat/server/knowledge_base/file_converter.py +++ b/langchain-chat/server/knowledge_base/file_converter.py @@ -915,8 +915,13 @@ class FileConverter: parts.append("
") return "".join(parts) if parts else '

(本页无文本内容)

' + @staticmethod + def _escape_html(text: str) -> str: + """HTML 转义""" + return text.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str: - """PDF 预览:使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片。""" + """PDF 预览:使用 pdfplumber 提取文本和表格,生成干净的 HTML。""" allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT) abs_input = os.path.abspath(input_path) if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep): @@ -930,35 +935,82 @@ class FileConverter: return "不是 PDF 文件" try: + import pdfplumber + import re + sections: list[str] = [] any_text = False - with fitz.open(abs_input) as doc: - for i in range(len(doc)): - page = doc.load_page(i) - # 使用 get_text("html") 保留格式和图片(base64内嵌) - page_html = (page.get_text("html") or "").strip() - if page_html: + + with pdfplumber.open(abs_input) as pdf: + for i, page in enumerate(pdf.pages): + page_parts: list[str] = [] + + # 提取表格 + tables = page.extract_tables() + table_bboxes = [] + if tables: + for tbl_settings in page.find_tables(): + table_bboxes.append(tbl_settings.bbox) + + # 提取文本(排除表格区域的文本) + text = page.extract_text() or "" + + if text.strip(): any_text = True + # 按行处理文本,识别标题 + lines = text.split('\n') + for line in lines: + line = line.strip() + if not line: + continue + # 简单的标题检测:短行 + 无标点结尾 + is_heading = (len(line) < 40 and not line.endswith(('。', ',', ';', '、', ':', ',', '.', ';')) + and not line.startswith(('(', '(')) + and re.match(r'^[一二三四五六七八九十\d]+[、..]', line)) + if is_heading: + escaped = self._escape_html(line) + page_parts.append(f'

{escaped}

') + else: + escaped = self._escape_html(line) + page_parts.append(f'

{escaped}

') + + # 渲染表格 + for table in tables: + if not table: + continue + page_parts.append('') + for row_idx, row in enumerate(table): + page_parts.append('') + tag = 'th' if row_idx == 0 else 'td' + for cell in row: + cell_text = self._escape_html(str(cell)) if cell is not None else '' + page_parts.append(f'<{tag}>{cell_text}') + page_parts.append('') + page_parts.append('
') + + page_html = '\n'.join(page_parts) sections.append( - f'
' - f'
第 {i + 1} 页
' - f"{page_html}
" + f'
' + f'
第 {i + 1} 页
' + f'{page_html}
' ) css = '''''' if not any_text: wrapper = ( f'{css}
' - "

(未能从 PDF 提取到文本,可能是扫描件或加密文档。)

" + '

(未能从 PDF 提取到文本,可能是扫描件或加密文档。)

' ) else: wrapper = ( diff --git a/langchain-chat/server/knowledge_base/kb_doc_api.py b/langchain-chat/server/knowledge_base/kb_doc_api.py index e0d6ba9..6364b86 100644 --- a/langchain-chat/server/knowledge_base/kb_doc_api.py +++ b/langchain-chat/server/knowledge_base/kb_doc_api.py @@ -4,9 +4,10 @@ import urllib from fastapi import File, Form, Body, Query, Response, UploadFile from configs import (DEFAULT_VS_TYPE, EMBEDDING_MODEL, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, - EXPR, + EXPR, CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE, logger, log_verbose, POLICY_KNOWLEDGE_BASE) +from configs.kb_config import ck_mysql_config from configs.model_config import LLM_MODELS from server.knowledge_base.cleanpdf import PdfConverter from server.knowledge_base.file_converter import FileConverter