212 lines
8.4 KiB
Python
212 lines
8.4 KiB
Python
import json
import urllib
import urllib.parse  # `import urllib` alone does not guarantee the `parse` submodule is loaded
from datetime import datetime
from typing import Dict, List

import requests
from langchain_core.pydantic_v1 import BaseModel, Extra, Field, root_validator

from configs import kgo_search_url, kgo_professional_search_url
from configs import LLM_MODELS, TEMPERATURE, MAX_TOKENS
from server.chat.policy_fun_iast import get_llm_model_response
# NOTE(review): presumably this star import is what provides `logger` — confirm.
# Kept last so that any names it exports still shadow the imports above.
from configs.basic_config import *
|
||
class KgoSearchAPIWrapper(BaseModel):
    """Client for the KGO (kgo.ckcest.cn) search endpoints.

    Two endpoints are used: the plain search at ``kgo_search_url`` (used by
    ``run`` and ``results_normal``) and the expression-based "professional"
    search at ``kgo_professional_search_url`` (used by ``results``).
    """

    # Extra query parameters sent with every request. They are unpacked last
    # into the parameter dict, so they override the built-in defaults.
    search_kwargs: dict = Field(default_factory=dict)

    # Seconds to wait for the KGO service before a request is abandoned.
    # Without a timeout `requests.get` can block indefinitely.
    request_timeout: float = 30.0

    # Resource-type display name -> numeric code used as a path segment when
    # building detail-page links (see `_detail_link`).
    search_map = {
        "全部": "1000",
        "期刊论文": "1002",
        "学位论文": "1003",
        "会议论文": "1004",
        "政策": "1005",
        "成果": "1006",
        "科技成果": "1006",
        "项目": "1007",
        "报告": "1010",
        "图书": "1011",
        "外文期刊论文": "1013",
        "外文资料": "1013",
        "期刊": "1015",
        "专利": "3001",
        "新闻": "4001",
        "数据集": "4004",
        "视频": "4005",
        "统计数据": "6001",
    }

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @staticmethod
    def _extract_datas(payload) -> List[dict]:
        """Return ``payload["results"]["datas"]`` for a successful response, else ``[]``.

        A response counts as successful when its ``code`` equals 10000. Any
        missing or unexpectedly-typed level yields an empty list instead of
        raising.
        """
        if not isinstance(payload, dict) or payload.get("code") != 10000:
            return []
        results = payload.get("results")
        if not isinstance(results, dict):
            return []
        datas = results.get("datas")
        return datas if isinstance(datas, list) else []

    @staticmethod
    def _join_values(value) -> str:
        """Normalise a list-or-string field into one comma-separated string."""
        if value is None:
            return ""
        if isinstance(value, str):
            # Joining a plain string would insert a comma between characters.
            return value
        if isinstance(value, (list, tuple)):
            return ",".join(str(item) for item in value if item is not None)
        return str(value)

    def _detail_link(self, result: dict) -> str:
        """Build the detail-page URL for one record.

        Returns ``""`` when the resource type is not in ``search_map`` or when
        the table name or record id is missing, so callers never receive a
        URL with an empty path segment.
        """
        resource_type = result.get("resource_type")
        type_code = self.search_map.get(resource_type) if isinstance(resource_type, str) else None
        table_name = result.get("source_table_name", "")
        record_id = result.get("_id", "")
        if not (type_code and table_name and record_id):
            return ""
        return f"https://kgo.ckcest.cn/kgo/detail/{type_code}/{table_name}/{record_id}.html"

    def _kgo_search_results(self, kgo_search_type: str, search_text: str) -> List[dict]:
        """Query the plain search endpoint.

        Args:
            kgo_search_type: Value sent as the ``type`` query parameter.
            search_text: Value sent as the ``text`` query parameter.

        Returns:
            The ``datas`` list of a successful response, otherwise ``[]``.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx status (propagated, as callers of this helper have
                always seen them).
        """
        params = {
            "type": kgo_search_type,
            "model": 20,
            "text": search_text,
            **self.search_kwargs,
        }
        response = requests.get(
            kgo_search_url,
            params=params,  # type: ignore
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        return self._extract_datas(response.json())

    def _kgo_professional_search_results(self, kgo_search_type: str, search_text: str, origin_query: str) -> List[dict]:
        """Query the professional (expression-based) search endpoint.

        The LLM is asked, via the ``kgo_get_year`` prompt, for a publication
        time expression derived from ``origin_query``; when it returns one it
        is AND-ed onto the ``theme=<search_text>`` expression.

        Args:
            kgo_search_type: Value of the ``type`` query parameter. Commas are
                deliberately left unencoded; everything else is URL-quoted.
            search_text: Theme text to search for.
            origin_query: The user's original query, passed to the LLM.

        Returns:
            The ``datas`` list of a successful response. ``[]`` on any request
            or decoding failure, or when the response carries no results.
        """
        year = get_llm_model_response(
            strategy_name="kgo_get_year",
            llm_model_name=LLM_MODELS[0],
            template_prompt_name="kgo_get_year",
            prompt_param_dict={
                "query": origin_query,
                "year": datetime.now().strftime("%Y"),
            },
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        logger.info(f'KGO资源发布时间表达式:{year}')

        express = "theme=" + search_text
        # NOTE(review): the return type of get_llm_model_response is not
        # visible here; guard against None/empty so the expression never ends
        # in a dangling " AND ".
        year_clause = str(year).strip() if year else ""
        if year_clause:
            express += " AND " + year_clause

        params = {
            "model": 20,
            "express": express,
            **self.search_kwargs,
        }
        query_params = urllib.parse.urlencode(params)
        # Keep commas literal in `type`, but escape anything that could break
        # the query string (&, #, spaces, ...).
        type_param = urllib.parse.quote(str(kgo_search_type), safe=",")
        url = f"{kgo_professional_search_url}?type={type_param}&{query_params}"
        logger.info(f'KGO专业检索请求URL: {url}')

        try:
            response = requests.get(url, timeout=self.request_timeout)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON error is not a RequestException.
            logger.error(f"请求异常: {e}")
            return []

        return self._extract_datas(payload)

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Pre-validation hook; currently returns ``values`` unchanged."""
        return values

    def run(self, text: str, kgo_search_type: str) -> str:
        """Run query through Kgo Search and return the snippets as one string.

        Args:
            text: The query to search for.
            kgo_search_type: Value sent as the ``type`` query parameter.

        Returns:
            The non-empty snippets joined by single spaces, or the fixed
            "No good Kgo Search Result was found" message when there are none.
        """
        results = self._kgo_search_results(search_text=text, kgo_search_type=kgo_search_type)
        # `results_normal` reads the abstract of these same records from "AB",
        # so fall back to it when a record carries no "snippet" key.
        snippets = [
            snippet
            for snippet in (result.get("snippet") or result.get("AB") or "" for result in results)
            if snippet
        ]
        if not snippets:
            return "No good Kgo Search Result was found"
        return " ".join(snippets)

    def results(self, text: str, kgo_search_type: str, origin_query: str, max_results: int = 5) -> List[Dict]:
        """Run query through the professional Kgo Search and return metadata.

        Args:
            text: The theme text to search for.
            kgo_search_type: Value of the ``type`` query parameter.
            origin_query: The user's original query, used to derive the
                publication time expression.
            max_results: Maximum number of records to return (default 5).

        Returns:
            ``[{"Result": "No good Kgo Search Result was found"}]`` when the
            search yields nothing; otherwise a list of dictionaries with the
            following keys:
                snippet - The ``AB`` field of the record.
                title - The title of the result.
                link - The detail-page link, or "" if it cannot be built.
                author - Authors as one comma-separated string.
                keywords - Keywords as one comma-separated string.
                publish_year - The ``publish_year`` field of the record.
                resource_type - The ``resource_type`` field of the record.
        """
        records = self._kgo_professional_search_results(
            search_text=text, kgo_search_type=kgo_search_type, origin_query=origin_query
        )
        if not records:
            return [{"Result": "No good Kgo Search Result was found"}]

        return [
            {
                "snippet": record.get("AB", ""),
                "title": record.get("title", ""),
                "link": self._detail_link(record),
                "author": self._join_values(record.get("author")),
                "keywords": self._join_values(record.get("keywords")),
                "publish_year": record.get("publish_year", ""),
                "resource_type": record.get("resource_type", ""),
            }
            for record in records[:max(max_results, 0)]
        ]

    def results_normal(self, text: str, kgo_search_type: str) -> List[Dict]:
        """Run query through the plain Kgo Search and return metadata.

        Args:
            text: The query to search for.
            kgo_search_type: Value sent as the ``type`` query parameter.

        Returns:
            ``[{"Result": "No good Kgo Search Result was found"}]`` when the
            search yields nothing; otherwise one dictionary per record with
            the keys ``snippet``, ``publish_year``, ``resource_type``,
            ``title``, ``link`` and ``keywords``. Missing fields become "".
            ``keywords`` is passed through as received.
        """
        records = self._kgo_search_results(search_text=text, kgo_search_type=kgo_search_type)
        if not records:
            return [{"Result": "No good Kgo Search Result was found"}]

        return [
            {
                "snippet": record.get("AB", ""),
                "publish_year": record.get("publish_year", ""),
                "resource_type": record.get("resource_type", ""),
                "title": record.get("title", ""),
                "link": self._detail_link(record),
                "keywords": record.get("keywords", ""),
            }
            for record in records
        ]
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: run one professional search and print the metadata.
    search = KgoSearchAPIWrapper()
    demo_query = "物理气相沉积"
    # `results` requires `origin_query`; the demo has no separate user
    # question, so the search text doubles as the original query.
    search_results = search.results(demo_query, "1000", origin_query=demo_query)
    print(search_results)
|