[前端+后端+RAG] 检索范围切换(当前文件/整个知识库);联网搜索功能(SearXNG);搜索结果带网络链接;修复RAG检索source格式不匹配bug

This commit is contained in:
2026-04-07 15:02:54 +08:00
parent a5110da4e8
commit e1e5d4f30d
6 changed files with 158 additions and 36 deletions

View File

@@ -50,6 +50,7 @@ async def self_kb_chat(
"content": "虎头虎脑"}]]
),
stream: bool = Body(True, description="流式输出"),
web_search: bool = Body(False, description="是否开启联网搜索"),
):
"""
个人知识库对话api\n
@@ -149,7 +150,29 @@ async def self_kb_chat(
except Exception as e:
logger.error(f"个人知识库问答路由错误: {self_kb_route}", exc_info=True)
docs = []
logger.info(f"个人知识库问答source_documents: {docs}")
logger.info(f"个人知识库问答source_documents: {len(docs)}")
# 联网搜索
web_search_context = ""
web_search_results = [] # 保存搜索结果供后面引用
if web_search:
try:
from server.chat.ZhipuSearchAPI import ZhipuSearchAPIWrapper
searcher = ZhipuSearchAPIWrapper()
web_results = searcher.zhipu_search(search_query)
web_search_results = web_results[:5] if web_results else []
if web_results:
web_parts = []
for i, r in enumerate(web_results[:5], 1):
title = r.get("title", "")
content = r.get("content", "")[:300]
url = r.get("url", "")
web_parts.append(f"[{i}] {title}\n{content}\n来源: {url}")
web_search_context = "\n\n【联网搜索结果】\n" + "\n\n".join(web_parts)
logger.info(f"联网搜索获取到 {len(web_results)} 条结果")
except Exception as e:
logger.error(f"联网搜索失败: {e}")
# if SELF_USE_RERANKER:
# reranker_model_path = MODEL_PATH["reranker"].get(RERANKER_MODEL,"BAAI/bge-reranker-large")
# print("-----------------model path------------------")
@@ -184,25 +207,28 @@ async def self_kb_chat(
if '0' in self_kb_route:
context = "\n".join([doc.page_content for doc in docs]).strip("xa0")
logger.info(f"个人知识库问答 context 长度:{len(context)}")
# context_70 = context if len(context)<30000 else TextRank(context,num_sentences=70)
context = context[:40000] if len(context)>40000 else context
logger.info(f"截取后个人知识库问答 context 长度:{len(context)}")
context = context[:30000] if len(context)>30000 else context
if web_search_context:
context += web_search_context
logger.info(f"最终 context 长度:{len(context)}")
if history:
history = history if len(history) < 20000 else TextRank(history,num_sentences=1)
# logger.info(f"个人知识库问答 context 长度超过 30000使用 TextRank 算法进行降维得到 context 长度:{len(context)}")
chain = LLMChain(prompt=chat_prompt, llm=model1, verbose=True)
task = asyncio.create_task(wrap_done(
chain.acall({"context": context, "question": query, "history": history, "quote": quote, "fileName":fileNames}),
callback.done),
)
elif '1' in self_kb_route:
# 联网搜索结果作为额外文档加入
if web_search_context:
from langchain.docstore.document import Document as LCDocument
docs.append(LCDocument(page_content=web_search_context, metadata={"source": "web_search"}))
chain = load_qa_chain(
model,
chain_type="stuff",
prompt=chat_prompt,
model,
chain_type="stuff",
prompt=chat_prompt,
verbose=True
)
# Begin a task that runs in the background.
task = asyncio.create_task(wrap_done(
chain.acall({"input_documents": docs, "question": query, "history": history, "quote": quote, "fileName":fileNames}),
callback.done),
@@ -235,14 +261,18 @@ async def self_kb_chat(
yield json.dumps(response, ensure_ascii=False)
await task
source_documents = []
if len(docs) == 0: # 没有找到相关文档
if len(docs) == 0 and not web_search_context:
source_documents.append(f"""暂未从本篇文献中找到答案,该回答为大模型自身能力解答!""")
else:
# 去除文件扩展名
# fileNames_without_ext = [name.rsplit('.', 1)[0] for name in fileNames]
# 连接文件名(如果有多个文件名)
# joined_fileNames = ', '.join(fileNames_without_ext)
source_documents.append(f"""[{len(source_documents) + 1}] [{docs[0].metadata.get("source")}]()\n""")
if len(docs) > 0:
source_documents.append(f"""[{len(source_documents) + 1}] [{docs[0].metadata.get("source")}]()\n""")
# 联网搜索结果链接
if web_search_results:
for r in web_search_results:
title = r.get("title", "").replace("\n", "")
url = r.get("url", "")
if title and url:
source_documents.append(f"""[{len(source_documents) + 1}] [{title}]({url})\n""")
yield json.dumps({"docs": source_documents}, ensure_ascii=False)
return EventSourceResponse(knowledge_base_chat_iterator(query))