From 379187f675712eda78cb5465111053b2e8c6667c Mon Sep 17 00:00:00 2001 From: liuguancen Date: Thu, 2 Apr 2026 14:21:41 +0800 Subject: [PATCH] =?UTF-8?q?[=E5=89=8D=E7=AB=AF+RAG]=20=E6=B7=BB=E5=8A=A0PD?= =?UTF-8?q?F=E9=98=85=E8=AF=BB=E6=A8=A1=E5=BC=8F=E5=88=87=E6=8D=A2(?= =?UTF-8?q?=E9=A2=84=E8=A7=88/=E7=AC=94=E8=AE=B0)=EF=BC=9B=E6=94=B9?= =?UTF-8?q?=E8=BF=9BPDF=E2=86=92HTML=E7=94=A8get=5Ftext(html)=E4=BF=9D?= =?UTF-8?q?=E7=95=99=E6=A0=BC=E5=BC=8F=EF=BC=9B=E4=BF=AE=E5=A4=8D=E9=95=BF?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=90=8D=E6=93=8D=E4=BD=9C=E8=8F=9C=E5=8D=95?= =?UTF-8?q?=E4=B8=8D=E6=98=BE=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chat_web_front/src/views/reading/index.vue | 43 ++++++++++++++++--- .../server/knowledge_base/file_converter.py | 23 ++++++---- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/chat_web_front/src/views/reading/index.vue b/chat_web_front/src/views/reading/index.vue index bdd6371..6526d50 100644 --- a/chat_web_front/src/views/reading/index.vue +++ b/chat_web_front/src/views/reading/index.vue @@ -102,12 +102,17 @@
{{ selectedFile.fileName }} +
+ 预览 + | + 阅读(笔记) +
-
+
- +
@@ -279,11 +284,23 @@ const docHtml = ref(''); const fileContent = ref(null); const readingBox = ref(null); const pdfData = ref(null); +const readingMode = ref(false); const fileType = computed(() => { const name = selectedFile.value?.fileName || ''; return name.split('.').pop()?.toLowerCase() || ''; }); +const switchToReadingMode = async () => { + readingMode.value = true; + // 如果还没加载 HTML 内容,加载一下 + if (!docHtml.value) { + await loadFileContent(); + } + await nextTick(); + bindFileContentEvents(); + handelNoteFlagMouseEvent(); +}; + // ===================== 笔记 ===================== const fileNote = reactive({ notes: [] as any[] }); const noteContent = ref(''); @@ -425,6 +442,7 @@ const handleNodeClick = async (data: any) => { fullContent: doc.context }; // 根据文件类型加载内容 + readingMode.value = false; const ext = doc.filename?.split('.').pop()?.toLowerCase() || ''; if (ext === 'pdf') { await loadPdfFile(); @@ -456,6 +474,14 @@ const loadPdfFile = async () => { }); if (res?.code === 200 && res.data) { fileNote.notes = res.data.notes || []; + if (res.data.content) { + // 保存 HTML 内容供阅读模式使用 + let content = res.data.content; + content = content.replace(pattern, (match: string, _cg: string, offset: number) => { + return transforMd(match); + }); + docHtml.value = content.replace(/

(.*?.*?<\/span>.*?)<\/p>/g, '$1'); + } } } catch {} // 绑定 PDF text layer 的选择事件 @@ -836,7 +862,7 @@ onMounted(async () => { .tree-file-icon { font-size: 13px; margin-right: 5px; } .tree-label { font-size: 13px; color: #333; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; cursor: pointer; } } - .tree-node-actions { opacity: 0; margin-left: 4px; .tree-operate-icon { width: 10px; cursor: pointer; padding: 4px; } } + .tree-node-actions { opacity: 0; margin-left: 4px; flex-shrink: 0; .tree-operate-icon { width: 10px; cursor: pointer; padding: 4px; } } } } @@ -867,8 +893,15 @@ onMounted(async () => { .center-content { flex: 1; display: flex; flex-direction: column; height: 100%; overflow: hidden; .center-header { - padding: 12px 20px; border-bottom: 1px solid #E6EDFF; - .center-title { font-weight: bold; font-size: 15px; color: #000; } + padding: 10px 20px; border-bottom: 1px solid #E6EDFF; + display: flex; justify-content: space-between; align-items: center; + .center-title { font-weight: bold; font-size: 15px; color: #000; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; flex: 1; min-width: 0; } + .view-mode-toggle { + flex-shrink: 0; margin-left: 12px; font-size: 13px; color: #999; + span { cursor: pointer; padding: 2px 6px; border-radius: 3px; } + span.active { color: #004EA0; font-weight: bold; background: #E8F0FE; } + .mode-sep { cursor: default; color: #ddd; padding: 0 2px; } + } } .file-content { flex: 1; overflow: auto; position: relative; padding: 0; diff --git a/langchain-chat/server/knowledge_base/file_converter.py b/langchain-chat/server/knowledge_base/file_converter.py index b3c8634..9fca692 100644 --- a/langchain-chat/server/knowledge_base/file_converter.py +++ b/langchain-chat/server/knowledge_base/file_converter.py @@ -916,7 +916,7 @@ class FileConverter: return "".join(parts) if parts else '

(本页无文本内容)

' def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str: - """PDF 预览:本进程内 PyMuPDF 抽文本生成 HTML,不调用外部 /convert 微服务。""" + """PDF 预览:使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片。""" allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT) abs_input = os.path.abspath(input_path) if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep): @@ -935,27 +935,32 @@ class FileConverter: with fitz.open(abs_input) as doc: for i in range(len(doc)): page = doc.load_page(i) - raw = (page.get_text() or "").strip() - if raw: + # 使用 get_text("html") 保留格式和图片(base64内嵌) + page_html = (page.get_text("html") or "").strip() + if page_html: any_text = True - inner = self._pdf_plain_text_to_html(raw) sections.append( f'
' f'
第 {i + 1} 页
' - f"{inner}
" + f"{page_html}" ) + css = '''''' + if not any_text: wrapper = ( - '
' + f'{css}
' "

(未能从 PDF 提取到文本,可能是扫描件或加密文档。)

" ) else: wrapper = ( - '
' + f'{css}
' f"{''.join(sections)}
" ) return self._save_html(f"{wrapper}", output_path)