diff --git a/chat_web_front/src/views/reading/index.vue b/chat_web_front/src/views/reading/index.vue
index 37920b3..d4c62c7 100644
--- a/chat_web_front/src/views/reading/index.vue
+++ b/chat_web_front/src/views/reading/index.vue
@@ -907,13 +907,6 @@ onMounted(async () => {
flex: 1; overflow: auto; position: relative; padding: 0;
.view-md {
padding: 20px;
- // 覆盖 PyMuPDF get_text("html") 输出的固定宽度
- :deep(div) { max-width: 100% !important; }
- :deep(div[style*="width:"]) { width: auto !important; max-width: 100% !important; }
- :deep(.pdf-page) { max-width: 100% !important; }
- :deep(.pdf-page > div) { width: auto !important; max-width: 100% !important; }
- :deep(.pdf-preview) { max-width: 100% !important; }
- :deep(section) { max-width: 100% !important; }
:deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; }
:deep(.highlight) { background: #D0EAC8; }
:deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; }
diff --git a/langchain-chat/server/knowledge_base/file_converter.py b/langchain-chat/server/knowledge_base/file_converter.py
index b75d5bb..0389d0b 100644
--- a/langchain-chat/server/knowledge_base/file_converter.py
+++ b/langchain-chat/server/knowledge_base/file_converter.py
@@ -915,8 +915,13 @@ class FileConverter:
parts.append("
")
return "".join(parts) if parts else '
(本页无文本内容)
' + @staticmethod + def _escape_html(text: str) -> str: + """HTML 转义""" + return text.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str: - """PDF 预览:使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片。""" + """PDF 预览:使用 pdfplumber 提取文本和表格,生成干净的 HTML。""" allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT) abs_input = os.path.abspath(input_path) if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep): @@ -930,35 +935,82 @@ class FileConverter: return "不是 PDF 文件" try: + import pdfplumber + import re + sections: list[str] = [] any_text = False - with fitz.open(abs_input) as doc: - for i in range(len(doc)): - page = doc.load_page(i) - # 使用 get_text("html") 保留格式和图片(base64内嵌) - page_html = (page.get_text("html") or "").strip() - if page_html: + + with pdfplumber.open(abs_input) as pdf: + for i, page in enumerate(pdf.pages): + page_parts: list[str] = [] + + # 提取表格 + tables = page.extract_tables() + table_bboxes = [] + if tables: + for tbl_settings in page.find_tables(): + table_bboxes.append(tbl_settings.bbox) + + # 提取文本(排除表格区域的文本) + text = page.extract_text() or "" + + if text.strip(): any_text = True + # 按行处理文本,识别标题 + lines = text.split('\n') + for line in lines: + line = line.strip() + if not line: + continue + # 简单的标题检测:短行 + 无标点结尾 + is_heading = (len(line) < 40 and not line.endswith(('。', ',', ';', '、', ':', ',', '.', ';')) + and not line.startswith(('(', '(')) + and re.match(r'^[一二三四五六七八九十\d]+[、..]', line)) + if is_heading: + escaped = self._escape_html(line) + page_parts.append(f'{escaped}
') + + # 渲染表格 + for table in tables: + if not table: + continue + page_parts.append('(未能从 PDF 提取到文本,可能是扫描件或加密文档。)
(未能从 PDF 提取到文本,可能是扫描件或加密文档。)
' ) else: wrapper = ( diff --git a/langchain-chat/server/knowledge_base/kb_doc_api.py b/langchain-chat/server/knowledge_base/kb_doc_api.py index e0d6ba9..6364b86 100644 --- a/langchain-chat/server/knowledge_base/kb_doc_api.py +++ b/langchain-chat/server/knowledge_base/kb_doc_api.py @@ -4,9 +4,10 @@ import urllib from fastapi import File, Form, Body, Query, Response, UploadFile from configs import (DEFAULT_VS_TYPE, EMBEDDING_MODEL, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, - EXPR, + EXPR, CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE, logger, log_verbose, POLICY_KNOWLEDGE_BASE) +from configs.kb_config import ck_mysql_config from configs.model_config import LLM_MODELS from server.knowledge_base.cleanpdf import PdfConverter from server.knowledge_base.file_converter import FileConverter