"""Thin client for the KGO search HTTP endpoints.

Exposes ``KgoSearchAPIWrapper``, which queries the plain search endpoint
(``kgo_search_url``) and the "professional" expression-based endpoint
(``kgo_professional_search_url``) and normalises the returned records.
"""
import json
import urllib.parse
from datetime import datetime
from typing import Dict, List

import requests
from langchain_core.pydantic_v1 import BaseModel, Extra, Field, root_validator

from configs import kgo_search_url, kgo_professional_search_url
from configs import LLM_MODELS, TEMPERATURE, MAX_TOKENS
from server.chat.policy_fun_iast import get_llm_model_response
# NOTE(review): `logger` is not imported explicitly anywhere in this module;
# it is presumably provided by this wildcard import — confirm. The wildcard
# import is kept last so that name shadowing is the same as before.
from configs.basic_config import *

# Value of the "code" field that the response handling treats as success.
_KGO_SUCCESS_CODE = 10000

# Upper bound (seconds) for each HTTP call so that a stalled server cannot
# block the caller forever; `requests` applies no timeout by default.
_REQUEST_TIMEOUT_SECONDS = 30

# Maximum number of records returned by `KgoSearchAPIWrapper.results`.
_MAX_PROFESSIONAL_RESULTS = 5

_NO_RESULT_MESSAGE = "No good Kgo Search Result was found"


def _extract_datas(payload) -> List[dict]:
    """Return ``payload["results"]["datas"]`` for a successful response.

    Any other shape (non-dict payload, different ``code``, missing or null
    ``results`` / ``datas``) yields an empty list instead of raising.
    """
    if not isinstance(payload, dict) or payload.get("code") != _KGO_SUCCESS_CODE:
        return []
    results = payload.get("results")
    if not isinstance(results, dict):
        return []
    return results.get("datas") or []


def _join_values(value) -> str:
    """Render a field that may be a list, a plain string or missing as text.

    Lists are joined with commas. A plain string is returned unchanged
    (joining it directly would insert a comma between every character), and
    ``None`` becomes an empty string.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    return ",".join(str(item) for item in value)


class KgoSearchAPIWrapper(BaseModel):
    """Wrapper around the KGO search and professional-search endpoints.

    Attributes are pydantic fields:
        search_kwargs: Extra query parameters merged into every request.
        search_map: Maps a resource-type name to the numeric code used in
            KGO detail-page links.
    """

    search_kwargs: dict = Field(default_factory=dict)
    search_map = {
        "全部": "1000",
        "期刊论文": "1002",
        "学位论文": "1003",
        "会议论文": "1004",
        "政策": "1005",
        "成果": "1006",
        "科技成果": "1006",
        "项目": "1007",
        "报告": "1010",
        "图书": "1011",
        "外文期刊论文": "1013",
        "外文资料": "1013",
        "期刊": "1015",
        "专利": "3001",
        "新闻": "4001",
        "数据集": "4004",
        "视频": "4005",
        "统计数据": "6001",
    }

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def _detail_link(self, result: dict) -> str:
        """Build the KGO detail-page URL for one record.

        Returns an empty string when the record lacks a source table name or
        a resource type. A resource type that is not in ``search_map``
        contributes an empty path segment rather than raising.
        """
        source_table_name = result.get("source_table_name", "")
        resource_type = result.get("resource_type", "")
        if not (source_table_name and resource_type):
            return ""
        type_code = self.search_map.get(resource_type, "")
        record_id = result.get("_id", "")
        return f"https://kgo.ckcest.cn/kgo/detail/{type_code}/{source_table_name}/{record_id}.html"

    def _kgo_search_results(self, kgo_search_type: str, search_text: str) -> List[dict]:
        """Query the plain search endpoint and return the raw record list.

        Args:
            kgo_search_type: Value sent as the ``type`` query parameter.
            search_text: Value sent as the ``text`` query parameter.

        Returns:
            The ``results.datas`` list of a successful response, otherwise
            an empty list.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx HTTP status (not caught here).
        """
        params = {
            "type": kgo_search_type,
            "model": 20,
            "text": search_text,
            **self.search_kwargs,
        }
        response = requests.get(
            kgo_search_url,
            params=params,  # type: ignore
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return _extract_datas(response.json())

    def _kgo_professional_search_results(self, kgo_search_type: str, search_text: str, origin_query: str) -> List[dict]:
        """Query the professional search endpoint with a ``theme=`` expression.

        An LLM call (prompt ``kgo_get_year``) is made with the original user
        query and the current year; its answer is appended to the search
        expression with ``AND``.

        Args:
            kgo_search_type: Value sent verbatim as the ``type`` parameter.
            search_text: Text placed after ``theme=`` in the expression.
            origin_query: Query passed to the LLM prompt.

        Returns:
            The ``results.datas`` list of a successful response; an empty
            list on any request error, non-JSON body or unsuccessful payload.
        """
        year = get_llm_model_response(
            strategy_name="kgo_get_year",
            llm_model_name=LLM_MODELS[0],
            template_prompt_name="kgo_get_year",
            prompt_param_dict={
                "query": origin_query,
                "year": datetime.now().strftime("%Y"),
            },
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        logger.info(f'KGO资源发布时间表达式:{year}')

        # Only append the LLM answer when it is usable text; an empty or
        # non-string answer would otherwise produce a dangling "AND" or a
        # TypeError during concatenation.
        express = "theme=" + search_text
        year_expr = year.strip() if isinstance(year, str) else ""
        if year_expr:
            express += " AND " + year_expr

        params = {
            "model": 20,
            "express": express,
            **self.search_kwargs,
        }
        # `type` is appended outside urlencode so it is sent verbatim (e.g.
        # commas in a multi-type value are not percent-encoded).
        query_params = urllib.parse.urlencode(params)
        url = f"{kgo_professional_search_url}?type={kgo_search_type}&{query_params}"
        logger.info(f'KGO专业检索请求URL: {url}')

        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on `requests` versions whose
            # JSON decode error is not a RequestException.
            logger.error(f"请求异常: {e}")
            return []
        return _extract_datas(payload)

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Return the incoming values unchanged (no validation is performed)."""
        return values

    def run(self, text: str, kgo_search_type: str) -> str:
        """Run query through Kgo Search and return the abstracts as one string.

        Args:
            text: The query to search for.
            kgo_search_type: Value sent as the ``type`` query parameter.

        Returns:
            The non-empty abstracts joined by single spaces, or a fixed
            "no result" message when nothing usable was returned.
        """
        results = self._kgo_search_results(search_text=text, kgo_search_type=kgo_search_type)
        # NOTE(review): the other methods of this class read the abstract
        # from the "AB" key of records from the same endpoint, so "snippet"
        # is presumably absent there — fall back to "AB" instead of raising
        # KeyError. Confirm against the API schema.
        snippets = [
            str(result.get("snippet") or result.get("AB") or "")
            for result in results
        ]
        snippets = [snippet for snippet in snippets if snippet]
        if not snippets:
            return _NO_RESULT_MESSAGE
        return " ".join(snippets)

    def results(self, text: str, kgo_search_type: str, origin_query: str) -> List[Dict]:
        """Run query through Kgo professional search and return metadata.

        Args:
            text: The query to search for.
            kgo_search_type: Value sent as the ``type`` query parameter.
            origin_query: Original user query, forwarded to the LLM step of
                the professional search.

        Returns:
            At most five dictionaries with the following keys:
                snippet - The ``AB`` field of the result.
                title - The title of the result.
                link - The detail-page link of the result ("" if it cannot
                    be built).
                author - Comma-joined authors.
                keywords - Comma-joined keywords.
                publish_year - The ``publish_year`` field of the result.
                resource_type - The ``resource_type`` field of the result.
            When nothing is found, a single ``{"Result": ...}`` dictionary.
        """
        results = self._kgo_professional_search_results(
            search_text=text,
            kgo_search_type=kgo_search_type,
            origin_query=origin_query,
        )
        if not results:
            return [{"Result": _NO_RESULT_MESSAGE}]

        return [
            {
                "snippet": result.get("AB", ""),
                "title": result.get("title", ""),
                "link": self._detail_link(result),
                "author": _join_values(result.get("author")),
                "keywords": _join_values(result.get("keywords")),
                "publish_year": result.get("publish_year", ""),
                "resource_type": result.get("resource_type", ""),
            }
            for result in results[:_MAX_PROFESSIONAL_RESULTS]
        ]

    def results_normal(self, text: str, kgo_search_type: str) -> List[Dict]:
        """Run query through the plain Kgo search and return metadata.

        Args:
            text: The query to search for.
            kgo_search_type: Value sent as the ``type`` query parameter.

        Returns:
            One dictionary per record with the keys ``snippet``,
            ``publish_year``, ``resource_type``, ``title``, ``link`` and
            ``keywords`` (missing fields become ""; ``keywords`` is passed
            through as received). When nothing is found, a single
            ``{"Result": ...}`` dictionary.
        """
        results = self._kgo_search_results(search_text=text, kgo_search_type=kgo_search_type)
        if not results:
            return [{"Result": _NO_RESULT_MESSAGE}]

        return [
            {
                "snippet": result.get("AB", ""),
                "publish_year": result.get("publish_year", ""),
                "resource_type": result.get("resource_type", ""),
                "title": result.get("title", ""),
                # Built with .get lookups so an unknown resource type or a
                # missing id no longer raises KeyError.
                "link": self._detail_link(result),
                "keywords": result.get("keywords", ""),
            }
            for result in results
        ]


if __name__ == '__main__':
    search = KgoSearchAPIWrapper()
    query = "物理气相沉积"
    # `results` requires the original query as its third argument.
    search_results = search.results(query, "1000", origin_query=query)
    print(search_results)