[前端+RAG] 修复PDF文字重叠;上传异步化(LLM+向量化后台执行);摘要关键词模型改为deepseek-v3

This commit is contained in:
2026-04-02 14:10:08 +08:00
parent 0e25154468
commit 5158753b94
3 changed files with 122 additions and 116 deletions

View File

@@ -269,6 +269,62 @@ def upload_docs(
return BaseResponse(code=200, msg="文件上传与向量化完成", data={"failed_files": failed_files})
def _write_llm_result_cache(cache_file: str, llm_result) -> None:
    """Serialize *llm_result* as JSON into *cache_file* atomically.

    The payload is first written to a sibling temporary file and then moved
    into place with ``os.replace``, so a concurrent reader of *cache_file*
    sees either the previous content or the complete new document, never a
    partially written one.

    Raises:
        OSError: if the directory or file cannot be created or replaced.
        TypeError / ValueError: if *llm_result* is not JSON serializable.
    """
    import json
    import threading

    parent_dir = os.path.dirname(cache_file)
    if parent_dir:
        # The file name may carry a sub-directory component.
        os.makedirs(parent_dir, exist_ok=True)

    # pid + thread id keep concurrent writers from sharing one temp file.
    tmp_file = f"{cache_file}.{os.getpid()}.{threading.get_ident()}.tmp"
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(llm_result, f, ensure_ascii=False)
        os.replace(tmp_file, cache_file)
    except Exception:
        # Do not leave a half-written temp file behind.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        raise


def _background_llm_and_vectorize(
    knowledge_base_name: str,
    file_names: List[str],
    chunk_size: int,
    chunk_overlap: int,
    zh_title_enhance: bool,
    docs: dict,
    not_refresh_vs_cache: bool,
):
    """Background-thread worker: generate LLM reading guides, then vectorize.

    Intended to run off the request thread so the upload response is not
    blocked. The work happens in two phases:

    1. For every name in *file_names*, run ``KnowledgeFile.get_llm_result()``
       on a private event loop and store the result as
       ``<filename>.llm_result.json`` under
       ``knowledge_base/<knowledge_base_name>/`` (three directory levels
       above this module). According to the original author's note this
       file is polled by a Java backend, hence the atomic write.
    2. Call ``_update_docs_impl`` for all files and, unless
       *not_refresh_vs_cache* is set, persist the vector store.

    Args:
        knowledge_base_name: Name of the target knowledge base.
        file_names: Names of the uploaded files to process.
        chunk_size: Forwarded unchanged to ``_update_docs_impl``.
        chunk_overlap: Forwarded unchanged to ``_update_docs_impl``.
        zh_title_enhance: Forwarded unchanged to ``_update_docs_impl``.
        docs: Forwarded unchanged to ``_update_docs_impl``.
        not_refresh_vs_cache: When true, the vector store is not saved here.

    Returns:
        None. Failures are logged, never raised: a failure for one file does
        not stop the remaining files, and a failure in phase 1 does not
        prevent phase 2.
    """
    import time

    start_time = time.time()
    kb = KBServiceFactory.get_service_by_name(knowledge_base_name)

    # Loop-invariant: every cache file of this knowledge base lives here.
    cache_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
        "knowledge_base",
        knowledge_base_name,
    )

    # 1. Generate the LLM reading guide (abstract, keywords, section overview).
    for filename in file_names:
        try:
            knowledge_file = KnowledgeFile(
                filename=filename, knowledge_base_name=knowledge_base_name
            )
            # This thread has no event loop of its own; use a private one.
            new_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(new_loop)
            try:
                llm_result = new_loop.run_until_complete(
                    knowledge_file.get_llm_result()
                )
            finally:
                new_loop.close()

            cache_file = os.path.join(cache_dir, f"{filename}.llm_result.json")
            _write_llm_result_cache(cache_file, llm_result)
            logger.info(f"[后台] LLM 导读生成完成: {filename}")
        except Exception as e:
            # Nothing else observes exceptions in this thread, so keep the
            # traceback in the log.
            logger.error(f"[后台] LLM 导读生成失败 {filename}: {e}", exc_info=True)

    # 2. Vectorize.
    try:
        result = _update_docs_impl(
            knowledge_base_name=knowledge_base_name,
            file_names=file_names,
            override_custom_docs=True,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            zh_title_enhance=zh_title_enhance,
            docs=docs,
            # Saving is handled once below instead of per file.
            not_refresh_vs_cache=True,
        )

        # Per-file failures can no longer be returned to the caller, so
        # surface them in the log instead of dropping them silently.
        data = getattr(result, "data", None)
        failed = data.get("failed_files") if isinstance(data, dict) else None
        if failed:
            logger.error(f"[后台] 向量化失败的文件: {failed}")

        if kb and not not_refresh_vs_cache:
            kb.save_vector_store()
        logger.info(f"[后台] 向量化完成,总耗时: {time.time() - start_time:.2f}s")
    except Exception as e:
        logger.error(f"[后台] 向量化失败: {e}", exc_info=True)
def upload_docs_new(
files: List[UploadFile] = File(..., description="上传文件,支持多文件"),
knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]),
@@ -282,16 +338,15 @@ def upload_docs_new(
not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库用于FAISS"),
) -> BaseResponse:
"""
API接口上传文件并/或向量化
API接口上传文件先提取全文快速返回LLM导读+向量化后台异步执行
"""
import time # 添加计时模块
import time
start_time = time.time()
if not validate_kb_name(knowledge_base_name):
return BaseResponse(code=403, msg="Don't attack me")
kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
if kb is None:
# 自动创建知识库
kb = KBServiceFactory.get_service(knowledge_base_name, DEFAULT_VS_TYPE, EMBEDDING_MODEL)
try:
kb.create_kb()
@@ -303,68 +358,52 @@ def upload_docs_new(
failed_files = {}
file_names = list(docs.keys())
# 生成摘要、关键词、章节速览的结果存储
llm_results = {}
# 先将上传的文件保存到磁盘
# 保存文件到磁盘 + 提取全文(快速操作)
for result in _save_files_in_thread(files, knowledge_base_name=knowledge_base_name, override=override):
filename = result["data"]["file_name"]
if result["code"] != 200:
failed_files[filename] = result["msg"]
if filename not in file_names:
file_names.append(filename)
# 针对成功上传的文件,生成摘要、关键词、章节速览
# 仅提取全文(快速),不调用 LLM
try:
knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name)
# 使用线程池运行异步函数,避免事件循环冲突
import concurrent.futures
def run_async_in_thread():
new_loop = asyncio.new_event_loop()
asyncio.set_event_loop(new_loop)
try:
return new_loop.run_until_complete(knowledge_file.get_llm_result())
finally:
new_loop.close()
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(run_async_in_thread)
llm_result = future.result()
full_text_data = knowledge_file.get_full_text()
import json as _json
try:
full_text = _json.loads(full_text_data).get("full_text", "")
except:
full_text = ""
llm_results[filename] = {
"full_text": llm_result.get("full_text", "获取全文失败"),
"article_abstract": llm_result.get("article_abstract", "生成摘要失败"),
"article_keywords": llm_result.get("article_keywords", "生成关键词失败"),
"article_paragraph": llm_result.get("article_paragraph", "生成章节速览失败")
"full_text": full_text,
"article_abstract": "导读生成中...",
"article_keywords": "导读生成中...",
"article_paragraph": "导读生成中..."
}
except Exception as e:
logger.error(f"生成LLM结果时出错{e}", exc_info=e if log_verbose else None)
logger.error(f"提取全文失败 {filename}: {e}")
llm_results[filename] = {
"article_abstract": "生成摘要失败",
"article_keywords": "生成关键词失败",
"article_paragraph": "生成章节速览失败"
"full_text": "",
"article_abstract": "导读生成中...",
"article_keywords": "导读生成中...",
"article_paragraph": "导读生成中..."
}
# 对保存的文件进行向量化
if to_vector_store:
update_st = time.time()
result = _update_docs_impl(
knowledge_base_name=knowledge_base_name,
file_names=file_names,
override_custom_docs=True,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
zh_title_enhance=zh_title_enhance,
docs=docs,
not_refresh_vs_cache=True,
)
failed_files.update(result.data["failed_files"])
if not not_refresh_vs_cache:
kb.save_vector_store()
logger.info(f'向量化用时:{time.time() - update_st}')
logger.info(f"总执行时间: {time.time() - start_time:.2f}s")
return BaseResponse(code=200, msg="文件上传与向量化完成", data={
# 后台异步执行 LLM 导读 + 向量化(不阻塞响应)
import threading
bg_thread = threading.Thread(
target=_background_llm_and_vectorize,
args=(knowledge_base_name, file_names, chunk_size, chunk_overlap,
zh_title_enhance, docs, not_refresh_vs_cache),
daemon=True
)
bg_thread.start()
logger.info(f"文件上传+全文提取用时: {time.time() - start_time:.2f}sLLM+向量化已转后台")
return BaseResponse(code=200, msg="文件上传完成,导读生成中", data={
"failed_files": failed_files,
"llm_results": llm_results
})

View File

@@ -390,7 +390,7 @@ class KnowledgeFile:
llm_time = time.time()
abstract_task = get_llm_model_response_async(
strategy_name="gen_abstract",
llm_model_name=LLM_MODELS[1],
llm_model_name=LLM_MODELS[0],
template_prompt_name="gen_abstract",
prompt_param_dict={"context": full_text},
temperature=0.7,
@@ -399,7 +399,7 @@ class KnowledgeFile:
keywords_task = get_llm_model_response_async(
strategy_name="gen_keywords",
llm_model_name=LLM_MODELS[1],
llm_model_name=LLM_MODELS[0],
template_prompt_name="gen_keywords",
prompt_param_dict={"context": full_text},
temperature=0.7,