[RAG] PDF阅读模式改用pdfplumber(文本+表格提取，干净HTML)；修复ck_mysql_config导入

2026-04-02 16:18:59 +08:00
parent 05e33d1d05
commit 46428b7936
3 changed files with 72 additions and 26 deletions
--- a/chat_web_front/src/views/reading/index.vue
+++ b/chat_web_front/src/views/reading/index.vue
@@ -907,13 +907,6 @@ onMounted(async () => {
      flex: 1; overflow: auto; position: relative; padding: 0;
      .view-md {
        padding: 20px;
        // 覆盖 PyMuPDF get_text("html") 输出的固定宽度
        :deep(div) { max-width: 100% !important; }
        :deep(div[style*="width:"]) { width: auto !important; max-width: 100% !important; }
        :deep(.pdf-page) { max-width: 100% !important; }
        :deep(.pdf-page > div) { width: auto !important; max-width: 100% !important; }
        :deep(.pdf-preview) { max-width: 100% !important; }
        :deep(section) { max-width: 100% !important; }
        :deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; }
        :deep(.highlight) { background: #D0EAC8; }
        :deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; }
--- a/langchain-chat/server/knowledge_base/file_converter.py
+++ b/langchain-chat/server/knowledge_base/file_converter.py
@@ -915,8 +915,13 @@ class FileConverter:
                parts.append("<br/>")
        return "".join(parts) if parts else '<p><em>（本页无文本内容）</em></p>'
    @staticmethod
    def _escape_html(text: str) -> str:
        """Escape the HTML-significant characters in ``text``.

        Replaces ``&``, ``<``, ``>`` and the double quote with their
        named entities. Single quotes are left unchanged.

        Args:
            text: Raw text to embed in HTML markup.

        Returns:
            The text with the four characters above replaced by entities.
        """
        # One pass over the input: each source character is mapped exactly
        # once, so the '&' inside an emitted entity is never escaped again.
        entity_table = str.maketrans({
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
        })
        return text.translate(entity_table)
    def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
-        """PDF 预览：使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片。"""
+        """PDF 预览：使用 pdfplumber 提取文本和表格，生成干净的 HTML。"""
        allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT)
        abs_input = os.path.abspath(input_path)
        if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep):
@@ -930,35 +935,82 @@ class FileConverter:
            return "不是 PDF 文件"
        try:
            import pdfplumber
            import re
            sections: list[str] = []
            any_text = False
-            with fitz.open(abs_input) as doc:
+
-                for i in range(len(doc)):
+            with pdfplumber.open(abs_input) as pdf:
-                    page = doc.load_page(i)
+                for i, page in enumerate(pdf.pages):
-                    # 使用 get_text("html") 保留格式和图片（base64内嵌）
+                    page_parts: list[str] = []
-                    page_html = (page.get_text("html") or "").strip()
+
-                    if page_html:
+                    # 提取表格
                    tables = page.extract_tables()
                    table_bboxes = []
                    if tables:
                        for tbl_settings in page.find_tables():
                            table_bboxes.append(tbl_settings.bbox)
                    # 提取文本（排除表格区域的文本）
                    text = page.extract_text() or ""
                    if text.strip():
                        any_text = True
                        # 按行处理文本，识别标题
                        lines = text.split('\n')
                        for line in lines:
                            line = line.strip()
                            if not line:
                                continue
                            # 简单的标题检测：短行 + 无标点结尾
                            is_heading = (len(line) < 40 and not line.endswith(('。', '，', '；', '、', '：', ',', '.', ';'))
                                          and not line.startswith(('（', '('))
                                          and re.match(r'^[一二三四五六七八九十\d]+[、.．]', line))
                            if is_heading:
                                escaped = self._escape_html(line)
                                page_parts.append(f'<h3>{escaped}</h3>')
                            else:
                                escaped = self._escape_html(line)
                                page_parts.append(f'<p>{escaped}</p>')
                    # 渲染表格
                    for table in tables:
                        if not table:
                            continue
                        page_parts.append('<table class="pdf-table">')
                        for row_idx, row in enumerate(table):
                            page_parts.append('<tr>')
                            tag = 'th' if row_idx == 0 else 'td'
                            for cell in row:
                                cell_text = self._escape_html(str(cell)) if cell is not None else ''
                                page_parts.append(f'<{tag}>{cell_text}</{tag}>')
                            page_parts.append('</tr>')
                        page_parts.append('</table>')
                    page_html = '\n'.join(page_parts)
                    sections.append(
-                        f'<section class="pdf-page" data-page="{i + 1}" '
+                        f'<section class="pdf-page" data-page="{i + 1}">'
-                        'style="margin-bottom:1.5em;padding-bottom:1em;border-bottom:1px solid #e5e5e5;">'
+                        f'<div class="pdf-page-num">第 {i + 1} 页</div>'
-                        f'<div style="font-size:12px;color:#888;margin-bottom:8px;">第 {i + 1} 页</div>'
+                        f'{page_html}</section>'
                        f"{page_html}</section>"
                    )
            css = '''<style>
-.pdf-preview { font-family: system-ui, -apple-system, sans-serif; line-height: 1.6; max-width: 100%; }
+.pdf-preview { font-family: "PingFang SC", "Microsoft YaHei", system-ui, sans-serif; line-height: 1.8; color: #333; }
-.pdf-preview > div { width: 100% !important; max-width: 100% !important; }
+.pdf-page { margin-bottom: 1.5em; padding-bottom: 1em; border-bottom: 1px solid #e5e5e5; }
-.pdf-preview img { max-width: 100%; height: auto; }
+.pdf-page-num { font-size: 12px; color: #999; margin-bottom: 8px; }
-.pdf-preview p { margin: 0.3em 0; }
+.pdf-preview p { margin: 0.3em 0; font-size: 15px; text-indent: 0; }
-.pdf-preview span { line-height: 1.5; }
+.pdf-preview h3 { font-size: 16px; font-weight: bold; margin: 1em 0 0.4em; color: #222; }
-.pdf-page > div { width: 100% !important; max-width: 100% !important; }
+.pdf-table { border-collapse: collapse; width: 100%; margin: 1em 0; font-size: 14px; }
 .pdf-table th, .pdf-table td { border: 1px solid #d0d0d0; padding: 6px 10px; text-align: left; vertical-align: top; }
 .pdf-table th { background: #f5f7fa; font-weight: bold; }
 .pdf-table tr:nth-child(even) { background: #fafbff; }
 </style>'''
            if not any_text:
                wrapper = (
                    f'{css}<div class="pdf-preview">'
-                    "<p><em>（未能从 PDF 提取到文本，可能是扫描件或加密文档。）</em></p></div>"
+                    '<p><em>（未能从 PDF 提取到文本，可能是扫描件或加密文档。）</em></p></div>'
                )
            else:
                wrapper = (
--- a/langchain-chat/server/knowledge_base/kb_doc_api.py
+++ b/langchain-chat/server/knowledge_base/kb_doc_api.py
@@ -4,9 +4,10 @@ import urllib
 from fastapi import File, Form, Body, Query, Response, UploadFile
 from configs import (DEFAULT_VS_TYPE, EMBEDDING_MODEL,
                     VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
-                     EXPR, 
+                     EXPR,
                     CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
                     logger, log_verbose, POLICY_KNOWLEDGE_BASE)
 from configs.kb_config import ck_mysql_config
 from configs.model_config import LLM_MODELS
 from server.knowledge_base.cleanpdf import PdfConverter
 from server.knowledge_base.file_converter import FileConverter