From f508a8b6b12baba9c5d667b3d7df3ef50cd22ac8 Mon Sep 17 00:00:00 2001 From: liuguancen Date: Thu, 2 Apr 2026 13:24:41 +0800 Subject: [PATCH] =?UTF-8?q?[=E5=89=8D=E7=AB=AF+RAG]=20PDF=E5=8E=9F?= =?UTF-8?q?=E7=94=9F=E6=B8=B2=E6=9F=93(pdfjs=20text=20layer)=EF=BC=9BExcel?= =?UTF-8?q?=E5=88=97=E5=AE=BD=E4=BC=98=E5=8C=96+=E6=B0=B4=E5=B9=B3?= =?UTF-8?q?=E6=BB=9A=E5=8A=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chat_web_front/package.json | 3 +- chat_web_front/src/components/PdfViewer.vue | 184 ++++++++++++++++++ chat_web_front/src/views/reading/index.vue | 63 +++++- .../server/knowledge_base/file_converter.py | 38 +++- 4 files changed, 282 insertions(+), 6 deletions(-) create mode 100644 chat_web_front/src/components/PdfViewer.vue diff --git a/chat_web_front/package.json b/chat_web_front/package.json index 7a8dc29..1b4533f 100644 --- a/chat_web_front/package.json +++ b/chat_web_front/package.json @@ -35,6 +35,7 @@ "markdown-it-sub": "^2.0.0", "markdown-it-sup": "^2.0.0", "normalize.css": "^8.0.1", + "pdfjs-dist": "^3.11.174", "pinia": "^2.2.6", "sa-sdk-javascript": "1.27.2", "sass": "^1.81.0", @@ -50,8 +51,8 @@ }, "devDependencies": { "@tsconfig/node22": "^22.0.0", - "@types/markdown-it": "^14.1.2", "@types/mark.js": "^8.11.12", + "@types/markdown-it": "^14.1.2", "@types/node": "^22.9.0", "@vitejs/plugin-vue": "^5.1.4", "@vue/tsconfig": "^0.5.1", diff --git a/chat_web_front/src/components/PdfViewer.vue b/chat_web_front/src/components/PdfViewer.vue new file mode 100644 index 0000000..6c3680d --- /dev/null +++ b/chat_web_front/src/components/PdfViewer.vue @@ -0,0 +1,184 @@ + + + + + diff --git a/chat_web_front/src/views/reading/index.vue b/chat_web_front/src/views/reading/index.vue index b5b62d9..612851a 100644 --- a/chat_web_front/src/views/reading/index.vue +++ b/chat_web_front/src/views/reading/index.vue @@ -103,7 +103,12 @@
{{ selectedFile.fileName }}
-
+ +
+ +
+ +
@@ -207,6 +212,7 @@ import {withLoading} from "@/utils/loading"; import {copyToClip, getGlobalSelectionPosition} from "@/utils"; import {transforMd} from "@/utils/markdown"; import ReadingBox from "@/components/ReadingBox.vue"; +import PdfViewer from "@/components/PdfViewer.vue"; import Loading from "@/components/Loading.vue"; import {UploadFilled} from '@element-plus/icons-vue'; import {ElMessage, ElMessageBox, type UploadFile, type UploadFiles} from "element-plus"; @@ -271,6 +277,11 @@ provide('selectedFile', selectedFile); const docHtml = ref(''); const fileContent = ref(null); const readingBox = ref(null); +const pdfData = ref(null); +const fileType = computed(() => { + const name = selectedFile.value?.fileName || ''; + return name.split('.').pop()?.toLowerCase() || ''; +}); // ===================== 笔记 ===================== const fileNote = reactive({ notes: [] as any[] }); @@ -412,7 +423,55 @@ const handleNodeClick = async (data: any) => { articleParagraph: doc.articleParagraph || '暂无内容,请重试', fullContent: doc.context }; - await loadFileContent(); + // 根据文件类型加载内容 + const ext = doc.filename?.split('.').pop()?.toLowerCase() || ''; + if (ext === 'pdf') { + await loadPdfFile(); + } else { + pdfData.value = null; + await loadFileContent(); + } +}; + +const loadPdfFile = async () => { + if (!selectedFile.value) return; + docHtml.value = ''; + try { + const blob = await downloadFile({ fileId: selectedFile.value.fileId }); + const arrayBuffer = await (blob as Blob).arrayBuffer(); + pdfData.value = arrayBuffer; + } catch (e: any) { + pdfData.value = null; + docHtml.value = '

PDF 文件加载失败

'; + } + // 同时加载 HTML 用于笔记功能(后台) + try { + let res = await getFileContent({ + fileId: selectedFile.value.fileId, + embeddingId: selectedFile.value.embeddingId, + knowledgeBaseId: selectedFile.value.folderId + }); + if (res?.code === 200 && res.data) { + fileNote.notes = res.data.notes || []; + } + } catch {} + // 绑定 PDF text layer 的选择事件 + await nextTick(); + setTimeout(() => { + if (fileContent.value) { + fileContent.value.addEventListener('mouseup', (event: MouseEvent) => { + setTimeout(() => { + const sel = window.getSelection(); if (!sel) return; + selectText.value = sel.toString(); + if (selectText.value && shortMenuDom.value) { + shortMenuShow.value = true; + (shortMenuDom.value as HTMLElement).style.left = event.clientX + 'px'; + (shortMenuDom.value as HTMLElement).style.top = event.clientY + 'px'; + } + }); + }); + } + }, 500); }; const handleCheckChange = () => { diff --git a/langchain-chat/server/knowledge_base/file_converter.py b/langchain-chat/server/knowledge_base/file_converter.py index b738573..b3c8634 100644 --- a/langchain-chat/server/knowledge_base/file_converter.py +++ b/langchain-chat/server/knowledge_base/file_converter.py @@ -1070,13 +1070,43 @@ class FileConverter: def xlsx_to_html(self, input_path: str, output_path: Optional[str] = None) -> str: try: import openpyxl + from openpyxl.utils import get_column_letter wb = openpyxl.load_workbook(input_path, data_only=True) - style = '''''' + style = '''''' html = [] for idx, sheet in enumerate(wb.worksheets): html.append(f'

Sheet {idx+1}: {sheet.title}

') - html.append('
') - html.append('') + # 计算列宽 + col_widths = {} + for col_idx in range(1, sheet.max_column + 1): + col_letter = get_column_letter(col_idx) + dim = sheet.column_dimensions.get(col_letter) + if dim and dim.width and dim.width > 0: + # openpyxl width 以字符数为单位,约 7px/字符 + col_widths[col_idx] = max(60, int(dim.width * 7.5)) + else: + # 根据内容估算宽度 + max_len = 8 + for row_idx in range(1, min(sheet.max_row + 1, 50)): + cell = sheet.cell(row=row_idx, column=col_idx) + if cell.value is not None: + max_len = max(max_len, len(str(cell.value))) + col_widths[col_idx] = max(60, min(300, max_len * 9)) + + html.append('
') + html.append('
') + # colgroup 设置列宽 + html.append('') + for col_idx in range(1, sheet.max_column + 1): + w = col_widths.get(col_idx, 80) + html.append(f'') + html.append('') + merged_map = {} for r in sheet.merged_cells.ranges: min_row, min_col, max_row, max_col = r.min_row, r.min_col, r.max_row, r.max_col @@ -1099,6 +1129,8 @@ class FileConverter: td_attrs += f' rowspan="{rowspan}"' if colspan > 1: td_attrs += f' colspan="{colspan}"' + # 合并单元格允许换行 + style_str += 'white-space:normal;word-wrap:break-word;' html.append(f'{cell_value}') html.append('') html.append('
')