# File: gangyan/langchain-chat/server/chat/KgoSearchAPIWrapper.py
# (212 lines, 8.4 KiB, Python)

import urllib
from langchain_core.pydantic_v1 import BaseModel, Extra, Field, root_validator
import requests
from typing import Dict, List
from configs import kgo_search_url, kgo_professional_search_url
from configs import LLM_MODELS, TEMPERATURE, MAX_TOKENS
from server.chat.policy_fun_iast import get_llm_model_response
from datetime import datetime
import json
from configs.basic_config import *
class KgoSearchAPIWrapper(BaseModel):
    """Wrapper around the KGO search HTTP endpoints.

    Two endpoints, both read from ``configs``, are queried:
    ``kgo_search_url`` (plain text search) and
    ``kgo_professional_search_url`` (expression-based search).
    A response is treated as successful only when its ``code`` is 10000.
    """

    # Extra query-string parameters merged into every request.
    search_kwargs: dict = Field(default_factory=dict)
    # Resource-type name -> numeric code used as a path segment when
    # building detail-page links.
    search_map: dict = {
        "全部": "1000",
        "期刊论文": "1002",
        "学位论文": "1003",
        "会议论文": "1004",
        "政策": "1005",
        "成果": "1006",
        "科技成果": "1006",
        "项目": "1007",
        "报告": "1010",
        "图书": "1011",
        "外文期刊论文": "1013",
        "外文资料": "1013",
        "期刊": "1015",
        "专利": "3001",
        "新闻": "4001",
        "数据集": "4004",
        "视频": "4005",
        "统计数据": "6001",
    }

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @staticmethod
    def _extract_datas(search_results) -> List[dict]:
        """Return the result rows of a successful response, else ``[]``.

        Guards every level of ``{"code": 10000, "results": {"datas": [...]}}``
        so a partial or unexpected payload yields an empty list instead of a
        ``KeyError``/``TypeError``.
        """
        if not isinstance(search_results, dict):
            return []
        if search_results.get("code") != 10000:
            return []
        payload = search_results.get("results")
        if not isinstance(payload, dict):
            return []
        return payload.get("datas") or []

    @staticmethod
    def _join_values(value) -> str:
        """Render a list-or-string field as one comma-separated string."""
        if not value:
            return ""
        if isinstance(value, str):
            # Joining a plain string would insert a comma between characters.
            return value
        return ",".join(str(item) for item in value)

    def _build_link(self, result: dict) -> str:
        """Build the detail-page URL for one result row.

        Returns ``""`` when the row has no usable ``source_table_name`` or
        ``resource_type``. A resource type missing from ``search_map``
        contributes an empty code segment rather than raising.
        """
        source_table_name = result.get("source_table_name", "")
        resource_type = result.get("resource_type", "")
        if not (source_table_name and resource_type):
            return ""
        type_code = self.search_map.get(resource_type, "")
        doc_id = result.get("_id", "")
        return (
            f"https://kgo.ckcest.cn/kgo/detail/"
            f"{type_code}/{source_table_name}/{doc_id}.html"
        )

    def _kgo_search_results(self, kgo_search_type: str, search_text: str) -> List[dict]:
        """Query the plain search endpoint and return its result rows.

        Args:
            kgo_search_type: Value sent as the ``type`` query parameter.
            search_text: Value sent as the ``text`` query parameter.

        Returns:
            The rows under ``results.datas`` on success, otherwise ``[]``.

        Raises:
            requests.RequestException: On connection failures or a non-2xx
                status (via ``raise_for_status``).
        """
        params = {
            "type": kgo_search_type,
            "model": 20,
            "text": search_text,
            **self.search_kwargs,
        }
        response = requests.get(
            kgo_search_url,
            params=params,  # type: ignore
        )
        response.raise_for_status()
        return self._extract_datas(response.json())

    def _kgo_professional_search_results(self, kgo_search_type: str, search_text: str, origin_query: str) -> List[dict]:
        """Query the professional (expression) search endpoint.

        An LLM call (``kgo_get_year`` prompt) turns ``origin_query`` plus the
        current year into a publish-time expression, which is AND-ed onto
        ``theme=<search_text>``.

        Args:
            kgo_search_type: Placed verbatim in the ``type`` query parameter.
            search_text: Theme text to search for.
            origin_query: The user's original query, passed to the LLM prompt.

        Returns:
            The rows under ``results.datas`` on success; ``[]`` on any
            request/parse error or unsuccessful response.
        """
        # Function-scope import: a bare ``import urllib`` does not guarantee
        # that the ``urllib.parse`` submodule is loaded.
        from urllib.parse import urlencode

        year = get_llm_model_response(
            strategy_name="kgo_get_year",
            llm_model_name=LLM_MODELS[0],
            template_prompt_name="kgo_get_year",
            prompt_param_dict={
                "query": origin_query,
                "year": datetime.now().strftime("%Y"),
            },
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        logger.info(f'KGO资源发布时间表达式{year}')

        express = "theme=" + search_text
        # NOTE(review): assumes the LLM helper returns a string expression —
        # confirm. An empty or non-string value is skipped so the expression
        # never ends in a dangling " AND ".
        if isinstance(year, str) and year.strip():
            express = express + " AND " + year.strip()

        params = {
            "model": 20,
            "express": express,
            **self.search_kwargs,
        }
        query_params = urlencode(params)
        # ``type`` is appended by hand so that its commas are NOT
        # percent-encoded.
        url = f"{kgo_professional_search_url}?type={kgo_search_type}&{query_params}"
        logger.info(f'KGO专业检索请求URL: {url}')
        try:
            response = requests.get(url)
            response.raise_for_status()
            return self._extract_datas(response.json())
        except (requests.RequestException, ValueError) as e:
            # ValueError covers an undecodable JSON body on requests versions
            # whose JSON error is not a RequestException.
            logger.error(f"请求异常: {e}")
        # Any error or missing data falls through to an empty result.
        return []

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Pre-validation hook; currently returns ``values`` unchanged."""
        return values

    def run(self, text: str, kgo_search_type: str) -> str:
        """Run a plain search and return the abstracts joined by spaces.

        Returns the fixed message ``"No good Kgo Search Result was found"``
        when the search yields no rows.
        """
        results = self._kgo_search_results(search_text=text, kgo_search_type=kgo_search_type)
        if not results:
            return "No good Kgo Search Result was found"
        # The other result methods read the abstract from "AB"; "snippet" is
        # still preferred when a row provides it.
        snippets = [
            str(result.get("snippet") or result.get("AB") or "")
            for result in results
        ]
        return " ".join(snippet for snippet in snippets if snippet)

    def results(self, text: str, kgo_search_type: str, origin_query: str) -> List[Dict]:
        """Run a professional search and return metadata for the top rows.

        Args:
            text: The theme text to search for.
            kgo_search_type: Value for the ``type`` query parameter.
            origin_query: The user's original query (used to derive the
                publish-time filter).

        Returns:
            At most five dictionaries with the keys ``snippet``, ``title``,
            ``link``, ``author``, ``keywords``, ``publish_year`` and
            ``resource_type``; or a single ``{"Result": ...}`` entry when
            nothing was found.
        """
        results = self._kgo_professional_search_results(
            search_text=text,
            kgo_search_type=kgo_search_type,
            origin_query=origin_query,
        )
        if not results:
            return [{"Result": "No good Kgo Search Result was found"}]
        # Only the first five rows are returned.
        return [
            {
                "snippet": result.get("AB", ""),
                "title": result.get("title", ""),
                "link": self._build_link(result),
                "author": self._join_values(result.get("author")),
                "keywords": self._join_values(result.get("keywords")),
                "publish_year": result.get("publish_year", ""),
                "resource_type": result.get("resource_type", ""),
            }
            for result in results[:5]
        ]

    def results_normal(self, text: str, kgo_search_type: str) -> List[Dict]:
        """Run a plain search and return metadata for every row.

        Returns:
            One dictionary per row with the keys ``snippet``,
            ``publish_year``, ``resource_type``, ``title``, ``link`` and
            ``keywords`` (``keywords`` is passed through as received); or a
            single ``{"Result": ...}`` entry when nothing was found.
        """
        results = self._kgo_search_results(search_text=text, kgo_search_type=kgo_search_type)
        if not results:
            return [{"Result": "No good Kgo Search Result was found"}]
        return [
            {
                "snippet": result.get("AB", ""),
                "publish_year": result.get("publish_year", ""),
                "resource_type": result.get("resource_type", ""),
                "title": result.get("title", ""),
                "link": self._build_link(result),
                "keywords": result.get("keywords", ""),
            }
            for result in results
        ]
if __name__ == '__main__':
    # Manual smoke test: runs one professional search and prints the metadata.
    search = KgoSearchAPIWrapper()
    # ``results`` requires ``origin_query``; the demo reuses the search text.
    search_results = search.results("物理气相沉积", "1000", origin_query="物理气相沉积")
    print(search_results)
# docs = []
# for result in search_results:
# print(result["snippet"])
# # print("snippet:" + result["snippet"])
# # print("link:" + result["link"])
# # print("title:" + result["title"])
# doc = Document(page_content=result["snippet"] if "snippet" in result.keys() else "",
# metadata={"source": result["link"] if "link" in result.keys() else "",
# "filename": result["title"] if "title" in result.keys() else ""})
# docs.append(doc)
# print(docs)