[前端+RAG] 恢复同步上传(移除后台线程)以修复导读生成;用 Uint8Array 保存 PDF 原始字节,修复 ArrayBuffer detached 问题;用 CSS 覆盖 PyMuPDF 输出的固定宽度,修复 PDF 阅读模式下的空白
This commit is contained in:
@@ -283,7 +283,14 @@ provide('selectedFile', selectedFile);
|
|||||||
const docHtml = ref('');
|
const docHtml = ref('');
|
||||||
const fileContent = ref(null);
|
const fileContent = ref(null);
|
||||||
const readingBox = ref(null);
|
const readingBox = ref(null);
|
||||||
const pdfData = ref<ArrayBuffer | null>(null);
|
const pdfBytes = ref<Uint8Array | null>(null); // 存原始字节,不会被 detach
|
||||||
|
const pdfData = computed(() => {
|
||||||
|
// pdfBytes 变化时复制一份新的 ArrayBuffer 给 PdfViewer(注意:computed 会缓存结果,重复访问返回同一份,不会每次重新复制)
|
||||||
|
if (!pdfBytes.value) return null;
|
||||||
|
const copy = new ArrayBuffer(pdfBytes.value.byteLength);
|
||||||
|
new Uint8Array(copy).set(pdfBytes.value);
|
||||||
|
return copy;
|
||||||
|
});
|
||||||
const readingMode = ref(false);
|
const readingMode = ref(false);
|
||||||
const fileType = computed(() => {
|
const fileType = computed(() => {
|
||||||
const name = selectedFile.value?.fileName || '';
|
const name = selectedFile.value?.fileName || '';
|
||||||
@@ -447,7 +454,7 @@ const handleNodeClick = async (data: any) => {
|
|||||||
if (ext === 'pdf') {
|
if (ext === 'pdf') {
|
||||||
await loadPdfFile();
|
await loadPdfFile();
|
||||||
} else {
|
} else {
|
||||||
pdfData.value = null;
|
pdfBytes.value = null;
|
||||||
await loadFileContent();
|
await loadFileContent();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -460,13 +467,9 @@ const loadPdfFile = async () => {
|
|||||||
params: { fileId: selectedFile.value.fileId },
|
params: { fileId: selectedFile.value.fileId },
|
||||||
responseType: 'arraybuffer'
|
responseType: 'arraybuffer'
|
||||||
});
|
});
|
||||||
// 复制 ArrayBuffer 避免被 Vue 响应式代理导致 detached
|
pdfBytes.value = new Uint8Array(resp.data as ArrayBuffer);
|
||||||
const src = resp.data as ArrayBuffer;
|
|
||||||
const copy = new ArrayBuffer(src.byteLength);
|
|
||||||
new Uint8Array(copy).set(new Uint8Array(src));
|
|
||||||
pdfData.value = copy;
|
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
pdfData.value = null;
|
pdfBytes.value = null;
|
||||||
docHtml.value = '<p style="color:#999;text-align:center;margin-top:40px;">PDF 文件加载失败</p>';
|
docHtml.value = '<p style="color:#999;text-align:center;margin-top:40px;">PDF 文件加载失败</p>';
|
||||||
}
|
}
|
||||||
// 同时加载 HTML 用于笔记功能(后台)
|
// 同时加载 HTML 用于笔记功能(后台)
|
||||||
@@ -911,6 +914,9 @@ onMounted(async () => {
|
|||||||
flex: 1; overflow: auto; position: relative; padding: 0;
|
flex: 1; overflow: auto; position: relative; padding: 0;
|
||||||
.view-md {
|
.view-md {
|
||||||
padding: 20px;
|
padding: 20px;
|
||||||
|
// 覆盖 PyMuPDF get_text("html") 输出的固定宽度
|
||||||
|
:deep(div[style*="width:"]) { width: 100% !important; max-width: 100% !important; }
|
||||||
|
:deep(.pdf-page > div) { width: 100% !important; }
|
||||||
:deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; }
|
:deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; }
|
||||||
:deep(.highlight) { background: #D0EAC8; }
|
:deep(.highlight) { background: #D0EAC8; }
|
||||||
:deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; }
|
:deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; }
|
||||||
|
|||||||
@@ -269,62 +269,6 @@ def upload_docs(
|
|||||||
return BaseResponse(code=200, msg="文件上传与向量化完成", data={"failed_files": failed_files})
|
return BaseResponse(code=200, msg="文件上传与向量化完成", data={"failed_files": failed_files})
|
||||||
|
|
||||||
|
|
||||||
def _background_llm_and_vectorize(
|
|
||||||
knowledge_base_name: str,
|
|
||||||
file_names: List[str],
|
|
||||||
chunk_size: int,
|
|
||||||
chunk_overlap: int,
|
|
||||||
zh_title_enhance: bool,
|
|
||||||
docs: dict,
|
|
||||||
not_refresh_vs_cache: bool,
|
|
||||||
):
|
|
||||||
"""后台线程:执行 LLM 导读生成 + 向量化,不阻塞上传响应。"""
|
|
||||||
import time
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
|
|
||||||
|
|
||||||
# 1. 生成 LLM 导读(摘要、关键词、章节速览)
|
|
||||||
for filename in file_names:
|
|
||||||
try:
|
|
||||||
knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name)
|
|
||||||
new_loop = asyncio.new_event_loop()
|
|
||||||
asyncio.set_event_loop(new_loop)
|
|
||||||
try:
|
|
||||||
llm_result = new_loop.run_until_complete(knowledge_file.get_llm_result())
|
|
||||||
finally:
|
|
||||||
new_loop.close()
|
|
||||||
|
|
||||||
# 将 LLM 结果写入缓存文件,供 Java 后端轮询读取
|
|
||||||
import json
|
|
||||||
cache_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "knowledge_base", knowledge_base_name)
|
|
||||||
os.makedirs(cache_dir, exist_ok=True)
|
|
||||||
cache_file = os.path.join(cache_dir, f"{filename}.llm_result.json")
|
|
||||||
with open(cache_file, 'w', encoding='utf-8') as f:
|
|
||||||
json.dump(llm_result, f, ensure_ascii=False)
|
|
||||||
logger.info(f"[后台] LLM 导读生成完成: {filename}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"[后台] LLM 导读生成失败 {filename}: {e}")
|
|
||||||
|
|
||||||
# 2. 向量化
|
|
||||||
try:
|
|
||||||
_update_docs_impl(
|
|
||||||
knowledge_base_name=knowledge_base_name,
|
|
||||||
file_names=file_names,
|
|
||||||
override_custom_docs=True,
|
|
||||||
chunk_size=chunk_size,
|
|
||||||
chunk_overlap=chunk_overlap,
|
|
||||||
zh_title_enhance=zh_title_enhance,
|
|
||||||
docs=docs,
|
|
||||||
not_refresh_vs_cache=True,
|
|
||||||
)
|
|
||||||
if kb and not not_refresh_vs_cache:
|
|
||||||
kb.save_vector_store()
|
|
||||||
logger.info(f"[后台] 向量化完成,总耗时: {time.time() - start_time:.2f}s")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"[后台] 向量化失败: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
def upload_docs_new(
|
def upload_docs_new(
|
||||||
files: List[UploadFile] = File(..., description="上传文件,支持多文件"),
|
files: List[UploadFile] = File(..., description="上传文件,支持多文件"),
|
||||||
knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]),
|
knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]),
|
||||||
@@ -338,7 +282,7 @@ def upload_docs_new(
|
|||||||
not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库(用于FAISS)"),
|
not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库(用于FAISS)"),
|
||||||
) -> BaseResponse:
|
) -> BaseResponse:
|
||||||
"""
|
"""
|
||||||
API接口:上传文件,先提取全文快速返回,LLM导读+向量化后台异步执行
|
API接口:上传文件,并/或向量化
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
@@ -360,50 +304,63 @@ def upload_docs_new(
|
|||||||
file_names = list(docs.keys())
|
file_names = list(docs.keys())
|
||||||
llm_results = {}
|
llm_results = {}
|
||||||
|
|
||||||
# 保存文件到磁盘 + 提取全文(快速操作)
|
# 先将上传的文件保存到磁盘
|
||||||
for result in _save_files_in_thread(files, knowledge_base_name=knowledge_base_name, override=override):
|
for result in _save_files_in_thread(files, knowledge_base_name=knowledge_base_name, override=override):
|
||||||
filename = result["data"]["file_name"]
|
filename = result["data"]["file_name"]
|
||||||
if result["code"] != 200:
|
if result["code"] != 200:
|
||||||
failed_files[filename] = result["msg"]
|
failed_files[filename] = result["msg"]
|
||||||
|
|
||||||
if filename not in file_names:
|
if filename not in file_names:
|
||||||
file_names.append(filename)
|
file_names.append(filename)
|
||||||
|
|
||||||
# 仅提取全文(快速),不调用 LLM
|
# 生成摘要、关键词、章节速览(模型已优化为 deepseek-v3)
|
||||||
try:
|
try:
|
||||||
knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name)
|
knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name)
|
||||||
full_text_data = knowledge_file.get_full_text()
|
import concurrent.futures
|
||||||
import json as _json
|
def run_async_in_thread():
|
||||||
try:
|
new_loop = asyncio.new_event_loop()
|
||||||
full_text = _json.loads(full_text_data).get("full_text", "")
|
asyncio.set_event_loop(new_loop)
|
||||||
except:
|
try:
|
||||||
full_text = ""
|
return new_loop.run_until_complete(knowledge_file.get_llm_result())
|
||||||
|
finally:
|
||||||
|
new_loop.close()
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
||||||
|
future = executor.submit(run_async_in_thread)
|
||||||
|
llm_result = future.result()
|
||||||
llm_results[filename] = {
|
llm_results[filename] = {
|
||||||
"full_text": full_text,
|
"full_text": llm_result.get("full_text", "获取全文失败"),
|
||||||
"article_abstract": "导读生成中...",
|
"article_abstract": llm_result.get("article_abstract", "生成摘要失败"),
|
||||||
"article_keywords": "导读生成中...",
|
"article_keywords": llm_result.get("article_keywords", "生成关键词失败"),
|
||||||
"article_paragraph": "导读生成中..."
|
"article_paragraph": llm_result.get("article_paragraph", "生成章节速览失败")
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"提取全文失败 {filename}: {e}")
|
logger.error(f"生成LLM结果时出错:{e}", exc_info=e if log_verbose else None)
|
||||||
llm_results[filename] = {
|
llm_results[filename] = {
|
||||||
"full_text": "",
|
"article_abstract": "生成摘要失败",
|
||||||
"article_abstract": "导读生成中...",
|
"article_keywords": "生成关键词失败",
|
||||||
"article_keywords": "导读生成中...",
|
"article_paragraph": "生成章节速览失败"
|
||||||
"article_paragraph": "导读生成中..."
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# 后台异步执行 LLM 导读 + 向量化(不阻塞响应)
|
# 对保存的文件进行向量化
|
||||||
import threading
|
if to_vector_store:
|
||||||
bg_thread = threading.Thread(
|
update_st = time.time()
|
||||||
target=_background_llm_and_vectorize,
|
result = _update_docs_impl(
|
||||||
args=(knowledge_base_name, file_names, chunk_size, chunk_overlap,
|
knowledge_base_name=knowledge_base_name,
|
||||||
zh_title_enhance, docs, not_refresh_vs_cache),
|
file_names=file_names,
|
||||||
daemon=True
|
override_custom_docs=True,
|
||||||
)
|
chunk_size=chunk_size,
|
||||||
bg_thread.start()
|
chunk_overlap=chunk_overlap,
|
||||||
|
zh_title_enhance=zh_title_enhance,
|
||||||
logger.info(f"文件上传+全文提取用时: {time.time() - start_time:.2f}s,LLM+向量化已转后台")
|
docs=docs,
|
||||||
return BaseResponse(code=200, msg="文件上传完成,导读生成中", data={
|
not_refresh_vs_cache=True,
|
||||||
|
)
|
||||||
|
failed_files.update(result.data["failed_files"])
|
||||||
|
if not not_refresh_vs_cache:
|
||||||
|
kb.save_vector_store()
|
||||||
|
logger.info(f'向量化用时:{time.time() - update_st}')
|
||||||
|
logger.info(f"总执行时间: {time.time() - start_time:.2f}s")
|
||||||
|
return BaseResponse(code=200, msg="文件上传与向量化完成", data={
|
||||||
"failed_files": failed_files,
|
"failed_files": failed_files,
|
||||||
"llm_results": llm_results
|
"llm_results": llm_results
|
||||||
})
|
})
|
||||||
|
|||||||
Reference in New Issue
Block a user