From 8b7e3a726b71af690d15c95fbaf09c3f72ea43be Mon Sep 17 00:00:00 2001 From: liuguancen Date: Thu, 2 Apr 2026 14:44:49 +0800 Subject: [PATCH] =?UTF-8?q?[=E5=89=8D=E7=AB=AF+RAG]=20=E6=81=A2=E5=A4=8D?= =?UTF-8?q?=E5=90=8C=E6=AD=A5=E4=B8=8A=E4=BC=A0=E4=BF=AE=E5=A4=8D=E5=AF=BC?= =?UTF-8?q?=E8=AF=BB=E7=94=9F=E6=88=90=EF=BC=9B=E7=94=A8Uint8Array?= =?UTF-8?q?=E5=AD=98PDF=E5=AD=97=E8=8A=82=E4=BF=AE=E5=A4=8Ddetached?= =?UTF-8?q?=EF=BC=9BCSS=E8=A6=86=E7=9B=96PDF=E9=98=85=E8=AF=BB=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E7=A9=BA=E7=99=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chat_web_front/src/views/reading/index.vue | 22 +-- .../server/knowledge_base/kb_doc_api.py | 129 ++++++------------ 2 files changed, 57 insertions(+), 94 deletions(-) diff --git a/chat_web_front/src/views/reading/index.vue b/chat_web_front/src/views/reading/index.vue index 4bc023b..8fff702 100644 --- a/chat_web_front/src/views/reading/index.vue +++ b/chat_web_front/src/views/reading/index.vue @@ -283,7 +283,14 @@ provide('selectedFile', selectedFile); const docHtml = ref(''); const fileContent = ref(null); const readingBox = ref(null); -const pdfData = ref(null); +const pdfBytes = ref(null); // 存原始字节,不会被 detach +const pdfData = computed(() => { + // 每次访问时复制一份新的 ArrayBuffer 给 PdfViewer + if (!pdfBytes.value) return null; + const copy = new ArrayBuffer(pdfBytes.value.byteLength); + new Uint8Array(copy).set(pdfBytes.value); + return copy; +}); const readingMode = ref(false); const fileType = computed(() => { const name = selectedFile.value?.fileName || ''; @@ -447,7 +454,7 @@ const handleNodeClick = async (data: any) => { if (ext === 'pdf') { await loadPdfFile(); } else { - pdfData.value = null; + pdfBytes.value = null; await loadFileContent(); } }; @@ -460,13 +467,9 @@ const loadPdfFile = async () => { params: { fileId: selectedFile.value.fileId }, responseType: 'arraybuffer' }); - // 复制 ArrayBuffer 避免被 Vue 响应式代理导致 detached - const src = resp.data as ArrayBuffer; - const copy = new ArrayBuffer(src.byteLength); - new Uint8Array(copy).set(new Uint8Array(src)); - pdfData.value = copy; + pdfBytes.value = new Uint8Array(resp.data as ArrayBuffer); } catch (e: any) { - pdfData.value = null; + pdfBytes.value = null; docHtml.value = '

PDF 文件加载失败

'; } // 同时加载 HTML 用于笔记功能(后台) @@ -911,6 +914,9 @@ onMounted(async () => { flex: 1; overflow: auto; position: relative; padding: 0; .view-md { padding: 20px; + // 覆盖 PyMuPDF get_text("html") 输出的固定宽度 + :deep(div[style*="width:"]) { width: 100% !important; max-width: 100% !important; } + :deep(.pdf-page > div) { width: 100% !important; } :deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; } :deep(.highlight) { background: #D0EAC8; } :deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; } diff --git a/langchain-chat/server/knowledge_base/kb_doc_api.py b/langchain-chat/server/knowledge_base/kb_doc_api.py index 77a6788..7cecb55 100644 --- a/langchain-chat/server/knowledge_base/kb_doc_api.py +++ b/langchain-chat/server/knowledge_base/kb_doc_api.py @@ -269,62 +269,6 @@ def upload_docs( return BaseResponse(code=200, msg="文件上传与向量化完成", data={"failed_files": failed_files}) -def _background_llm_and_vectorize( - knowledge_base_name: str, - file_names: List[str], - chunk_size: int, - chunk_overlap: int, - zh_title_enhance: bool, - docs: dict, - not_refresh_vs_cache: bool, -): - """后台线程:执行 LLM 导读生成 + 向量化,不阻塞上传响应。""" - import time - start_time = time.time() - - kb = KBServiceFactory.get_service_by_name(knowledge_base_name) - - # 1. 生成 LLM 导读(摘要、关键词、章节速览) - for filename in file_names: - try: - knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name) - new_loop = asyncio.new_event_loop() - asyncio.set_event_loop(new_loop) - try: - llm_result = new_loop.run_until_complete(knowledge_file.get_llm_result()) - finally: - new_loop.close() - - # 将 LLM 结果写入缓存文件,供 Java 后端轮询读取 - import json - cache_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "knowledge_base", knowledge_base_name) - os.makedirs(cache_dir, exist_ok=True) - cache_file = os.path.join(cache_dir, f"{filename}.llm_result.json") - with open(cache_file, 'w', encoding='utf-8') as f: - json.dump(llm_result, f, ensure_ascii=False) - logger.info(f"[后台] LLM 导读生成完成: {filename}") - except Exception as e: - logger.error(f"[后台] LLM 导读生成失败 {filename}: {e}") - - # 2. 向量化 - try: - _update_docs_impl( - knowledge_base_name=knowledge_base_name, - file_names=file_names, - override_custom_docs=True, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - zh_title_enhance=zh_title_enhance, - docs=docs, - not_refresh_vs_cache=True, - ) - if kb and not not_refresh_vs_cache: - kb.save_vector_store() - logger.info(f"[后台] 向量化完成,总耗时: {time.time() - start_time:.2f}s") - except Exception as e: - logger.error(f"[后台] 向量化失败: {e}") - - def upload_docs_new( files: List[UploadFile] = File(..., description="上传文件,支持多文件"), knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]), @@ -338,7 +282,7 @@ def upload_docs_new( not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库(用于FAISS)"), ) -> BaseResponse: """ - API接口:上传文件,先提取全文快速返回,LLM导读+向量化后台异步执行 + API接口:上传文件,并/或向量化 """ import time start_time = time.time() @@ -360,50 +304,63 @@ def upload_docs_new( file_names = list(docs.keys()) llm_results = {} - # 保存文件到磁盘 + 提取全文(快速操作) + # 先将上传的文件保存到磁盘 for result in _save_files_in_thread(files, knowledge_base_name=knowledge_base_name, override=override): filename = result["data"]["file_name"] if result["code"] != 200: failed_files[filename] = result["msg"] + if filename not in file_names: file_names.append(filename) - # 仅提取全文(快速),不调用 LLM + # 生成摘要、关键词、章节速览(模型已优化为 deepseek-v3) try: knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name) - full_text_data = knowledge_file.get_full_text() - import json as _json - try: - full_text = _json.loads(full_text_data).get("full_text", "") - except: - full_text = "" + import concurrent.futures + def run_async_in_thread(): + new_loop = asyncio.new_event_loop() + asyncio.set_event_loop(new_loop) + try: + return new_loop.run_until_complete(knowledge_file.get_llm_result()) + finally: + new_loop.close() + + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(run_async_in_thread) + llm_result = future.result() llm_results[filename] = { - "full_text": full_text, - "article_abstract": "导读生成中...", - "article_keywords": "导读生成中...", - "article_paragraph": "导读生成中..." + "full_text": llm_result.get("full_text", "获取全文失败"), + "article_abstract": llm_result.get("article_abstract", "生成摘要失败"), + "article_keywords": llm_result.get("article_keywords", "生成关键词失败"), + "article_paragraph": llm_result.get("article_paragraph", "生成章节速览失败") } except Exception as e: - logger.error(f"提取全文失败 {filename}: {e}") + logger.error(f"生成LLM结果时出错:{e}", exc_info=e if log_verbose else None) llm_results[filename] = { - "full_text": "", - "article_abstract": "导读生成中...", - "article_keywords": "导读生成中...", - "article_paragraph": "导读生成中..." + "article_abstract": "生成摘要失败", + "article_keywords": "生成关键词失败", + "article_paragraph": "生成章节速览失败" } - # 后台异步执行 LLM 导读 + 向量化(不阻塞响应) - import threading - bg_thread = threading.Thread( - target=_background_llm_and_vectorize, - args=(knowledge_base_name, file_names, chunk_size, chunk_overlap, - zh_title_enhance, docs, not_refresh_vs_cache), - daemon=True - ) - bg_thread.start() - - logger.info(f"文件上传+全文提取用时: {time.time() - start_time:.2f}s,LLM+向量化已转后台") - return BaseResponse(code=200, msg="文件上传完成,导读生成中", data={ + # 对保存的文件进行向量化 + if to_vector_store: + update_st = time.time() + result = _update_docs_impl( + knowledge_base_name=knowledge_base_name, + file_names=file_names, + override_custom_docs=True, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance, + docs=docs, + not_refresh_vs_cache=True, + ) + failed_files.update(result.data["failed_files"]) + if not not_refresh_vs_cache: + kb.save_vector_store() + logger.info(f'向量化用时:{time.time() - update_st}') + logger.info(f"总执行时间: {time.time() - start_time:.2f}s") + return BaseResponse(code=200, msg="文件上传与向量化完成", data={ "failed_files": failed_files, "llm_results": llm_results })