"""Knowledge-base, vector-store and search configuration.

This module is a flat collection of constants plus two small helpers that
resolve the URL of the external PDF conversion service. Importing it has one
side effect: the knowledge-base root directory is created if it is missing.
"""
import os
import re
from pathlib import Path

# ---------------------------------------------------------------------------
# Knowledge base (collection) names
# ---------------------------------------------------------------------------
# Every *_BASE constant below currently points at DEFAULT_KNOWLEDGE_BASE.
# The quoted name in each trailing comment is kept verbatim from the original
# file; presumably it is the dedicated collection that constant used to
# reference -- TODO confirm before re-enabling any of them.

# Default knowledge base.
# Alternative (disabled): DEFAULT_KNOWLEDGE_BASE = "t_policy_total_bge_v1"
SELF_KNOWLEDGE_BASE = re.compile(r'^p_.*')  # personal knowledge base name pattern
DEFAULT_KNOWLEDGE_BASE = "t_policy_total_bge_new_v2"

DEFAULT_POLICY_BASE = DEFAULT_KNOWLEDGE_BASE  # "t_policy_total_bge_new_v2"
DEFAULT_POLICY_BASE_NAME = "政策库"
DEFAULT_REPORT_BASE1 = DEFAULT_KNOWLEDGE_BASE  # "t_strategy_report_bge_v2"
DEFAULT_REPORT_BASE = DEFAULT_KNOWLEDGE_BASE  # "gydemo_report_v2"
DEFAULT_REPORT_BASE_NAME = "报告库"
DEFAULT_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "t_journal_article_bge_v1"
DEFAULT_JOURNAL_BASE_NAME = "期刊论文库"

GY_NEWS_BASE = DEFAULT_KNOWLEDGE_BASE  # "gydemo_news_v2"
GY_NEWS_BASE_NAME = "冶金行业新闻库"
GY_REPORT_BASE = DEFAULT_KNOWLEDGE_BASE  # "gydemo_report_v2"
GY_REPORT_BASE_NAME = "冶金行业报告库"
GY_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "gy_demo_journal_v3"
GY_JOURNAL_BASE_NAME = "冶金专业知识库"

OLD_POLICY_BASE = ['t_policy_total_bge_v1', 't_policy_total_bge_new_v1']

# Default vector store / full-text search engine type.
# Options: faiss, milvus (offline) & zilliz (online), pgvector, chromadb;
# full-text search engine: es.
DEFAULT_VS_TYPE = "milvus"

# Newly added metallurgy-series knowledge base constants.
YJ_CH_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_ch_journal_bge_v1_recover"
YJ_CH_JOURNAL_BASE_NAME = "冶金中文期刊库"
YJ_NEWS_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_news_bge_v1_recover"
YJ_NEWS_BASE_NAME = "冶金新闻库(2024年以及之前)"
YJ_FOR_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_for_journal_bge_v1_recover"
YJ_FOR_JOURNAL_BASE_NAME = "冶金外文期刊库"
YJ_OA_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_oa_journal_bge_v2_recover"
YJ_OA_JOURNAL_BASE_NAME = "冶金OA期刊库"
YJ_POLICYS_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_policys_bge_v1_recover"
YJ_POLICYS_BASE_NAME = "冶金政策库"

# Dedicated metallurgy news / journal / report bases can be appended here if
# needed.
YJ_BASE_NAME = [
    YJ_CH_JOURNAL_BASE_NAME,
    YJ_NEWS_BASE_NAME,
    YJ_FOR_JOURNAL_BASE_NAME,
    YJ_OA_JOURNAL_BASE_NAME,
    YJ_POLICYS_BASE_NAME,
]

# Display names; extended further down with the YJ_* and steel names.
CH_BASE_NAME = (
    DEFAULT_POLICY_BASE_NAME,
    DEFAULT_JOURNAL_BASE_NAME,
    GY_NEWS_BASE_NAME,
    GY_REPORT_BASE_NAME,
    GY_JOURNAL_BASE_NAME,
    DEFAULT_REPORT_BASE_NAME,
)

# Collection names; extended further down.
# NOTE(review): the YJ_* entries are listed here AND added again by the
# EN_BASE_NAME.extend(...) call near the end of this module, so they appear
# twice. The ordering also differs from CH_BASE_NAME. Left unchanged because
# callers may depend on the current contents -- confirm before deduplicating.
EN_BASE_NAME = [
    DEFAULT_POLICY_BASE,
    DEFAULT_REPORT_BASE,
    DEFAULT_REPORT_BASE1,
    DEFAULT_JOURNAL_BASE,
    GY_NEWS_BASE,
    GY_REPORT_BASE,
    GY_JOURNAL_BASE,
    YJ_CH_JOURNAL_BASE,
    YJ_NEWS_BASE,
    YJ_FOR_JOURNAL_BASE,
    YJ_OA_JOURNAL_BASE,
    YJ_POLICYS_BASE,
]

GY_BASE_NAME = [GY_NEWS_BASE, GY_REPORT_BASE, GY_JOURNAL_BASE]
OLD_JOURNAL_BASE = ['t_journal_article_bge_v0']

# ---------------------------------------------------------------------------
# Retrieval parameters
# ---------------------------------------------------------------------------
# Number of cached vector stores (FAISS only).
CACHED_VS_NUM = 1

# Number of cached temporary vector stores (FAISS only), used for file chat.
CACHED_MEMO_VS_NUM = 10

# Length of a single text chunk in the knowledge base
# (not applicable to MarkdownHeaderTextSplitter).
CHUNK_SIZE = 250

# Overlap length between adjacent chunks
# (not applicable to MarkdownHeaderTextSplitter).
OVERLAP_SIZE = 50

# Number of vectors to match per knowledge-base query.
VECTOR_SEARCH_TOP_K = 5

# Distance threshold for knowledge-base matching, normally within 0-1; the
# smaller the score, the smaller the distance and the higher the relevance.
# Some users reported scores above 1, so for compatibility the default is 1
# and the WEBUI allows adjusting it within 0-2.
SCORE_THRESHOLD = 1.0

# zsj: similarity threshold for duplicate documents, 0-1; the larger the
# threshold, the more similar two documents must be.
DUPLICATE_THRESHOLD = 0.98

# ---------------------------------------------------------------------------
# Search engines and auxiliary services
# ---------------------------------------------------------------------------
# Default search engine. Options: bing, duckduckgo, metaphor.
# Use duckduckgo while no local KGO search is deployed; switch to kgo once a
# self-hosted search service is available.
# Alternative (disabled): DEFAULT_SEARCH_ENGINE = "duckduckgo"
DEFAULT_SEARCH_ENGINE = "duckduckgo"
# Change the port if a KGO search service is deployed.
kgo_search_url = r"http://127.0.0.1:10326/search/search"
kgo_professional_search_url = r"http://127.0.0.1:8327/search/professionalSearch"

# Image generation endpoints.
realistic_url = r"http://127.0.0.1:5000/generate"
ink_url = r"http://127.0.0.1:5000/generate-image"

# MySQL configuration.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store.
ck_mysql_config = {
    "host": "127.0.0.1",
    "port": 33306,
    "user": "root",
    "password": "1234567890",
    "database": "chat_gpt_yj",
    "charset": "utf8mb4",
}

# Number of search-engine results to match.
SEARCH_ENGINE_TOP_K = 3
DOWNLOAD_HOST_CK = r"http://127.0.0.1:8099/chat_web_backend"
KB_ROOT_PATH2 = Path(__file__).resolve().parent.parent / "knowledge_base"

# Relevance comparison endpoint.
similarity_url = r"http://127.0.0.1:5000/similar"
similarity_score = 0.4  # relevance threshold for knowledge-base search
similarity_internet = 0.5  # relevance threshold for internet search

# Variables required for Bing search.
# Bing search needs a Bing Subscription Key; apply for a bing search trial in
# the Azure portal. For how to apply, see
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource
# For creating a Bing API search instance with Python, see
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/quickstarts/rest/python
BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
# Note: this is NOT the Bing Webmaster Tools API key.
# Also, if running on a server reports
# "Failed to establish a new connection: [Errno 110] Connection timed out",
# the server is behind a firewall and an administrator must whitelist the
# host; on a locked-down company server this is unlikely to be possible.
BING_SUBSCRIPTION_KEY = ""

# Metaphor search requires an API key.
# NOTE(review): API key is hard-coded in source; consider loading it from the
# environment.
METAPHOR_API_KEY = "e09d3cdd-e7e1-41d7-a419-6b298002d921"

# Seniverse weather API key, used by the weather Agent.
# Apply at https://www.seniverse.com/
# NOTE(review): API key is hard-coded in source; consider loading it from the
# environment.
SENIVERSE_API_KEY = "STNmmw0iUKB96PNpJ"

# Whether to enable Chinese title enhancement, and its related settings.
# Title detection marks which texts are titles in the metadata; each text is
# then joined with its parent title to enrich the text information.
ZH_TITLE_ENHANCE = False

# PDF OCR control: only OCR images whose width and height exceed the given
# fractions of the page (image width / page width, image height / page
# height). This avoids interference from small images in a PDF and speeds up
# processing of non-scanned PDFs.
PDF_OCR_THRESHOLD = (0.6, 0.6)

# Initial description of each knowledge base, shown on initialisation and used
# for Agent calls. A knowledge base without a description is not called by the
# Agent.
KB_INFO = {
    "知识库名称": "知识库介绍",
    "samples": "关于本项目issue的解答",
}

# ---------------------------------------------------------------------------
# Personal knowledge base settings
# ---------------------------------------------------------------------------
SELF_SCORE_THRESHOLD = 1.9
SELF_TOP_K = 5
SELF_TEMPERATURE = 0.3
SELF_MAX_TOKENS = 8192
# No reranker needed with milvus: milvus already returns recalled results
# sorted by score.
SELF_USE_RERANKER = False

GENERATED_IMAGES_BASE_PATH = "/home/albert/workspaces/modelSpaces/models/text_to_pic/generated_images"
IMAGE_SERVER_URL_TEMPLATE = "http://127.0.0.1:8099/chat_web_backend/get-image?file_name={}"
KB_CHAT_TEMP_DIR = "/home/albert/workspaces/modelSpaces/models/tmp"

# Location of the Chrome browser used for page scraping.
CHROME_DIR = "/home/albert/workspaces/modelSpaces/models/chrome"

# ---------------------------------------------------------------------------
# The settings below normally do not need to be changed
# ---------------------------------------------------------------------------
# Default storage path of the knowledge base.
KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")
# exist_ok=True avoids the check-then-create race when several processes
# import this module at the same time.
os.makedirs(KB_ROOT_PATH, exist_ok=True)

# PDF -> Markdown external microservice (file_converter.pdf_to_html).
# Clients must never request http://0.0.0.0:...: 0.0.0.0 is only meaningful as
# a bind address; used as a request URL the Host is abnormal and the server
# often answers 403.

# Matches 0.0.0.0 only as a standalone token, so addresses that merely contain
# it as a substring (e.g. 10.0.0.0) are left untouched.
_WILDCARD_HOST_RE = re.compile(r"(?<![\d.])0\.0\.0\.0(?![\d.])")


def _normalize_pdf_convert_api_url(raw: str) -> str:
    """Return a client-usable URL for the PDF conversion service.

    Args:
        raw: Candidate URL. A falsy or whitespace-only value selects the
            default ``http://127.0.0.1:6006/convert/``.

    Returns:
        The stripped URL with every standalone ``0.0.0.0`` replaced by
        ``127.0.0.1``.
    """
    url = (raw or "").strip() or "http://127.0.0.1:6006/convert/"
    return _WILDCARD_HOST_RE.sub("127.0.0.1", url)


PDF_CONVERT_API_URL = _normalize_pdf_convert_api_url(
    os.environ.get("PDF_CONVERT_API_URL", "http://127.0.0.1:6006/convert/")
)
# The pdf_path passed to the conversion service is relative to this directory.
# It must match the knowledge-base root that the microservice process can read
# (set the PDF_CONVERT_KB_ROOT environment variable when they differ).
PDF_CONVERT_KB_ROOT = os.path.abspath(os.environ.get("PDF_CONVERT_KB_ROOT", KB_ROOT_PATH))


def get_pdf_convert_api_url() -> str:
    """Return the PDF conversion service URL to use for an HTTP request.

    Call this right before issuing the request instead of relying only on the
    module-level ``PDF_CONVERT_API_URL``: the current process's
    ``PDF_CONVERT_API_URL`` environment variable takes priority, which avoids
    a subprocess or stale ``.pyc`` still carrying ``http://0.0.0.0:6006``.

    Returns:
        The normalised URL; falls back to the module-level
        ``PDF_CONVERT_API_URL`` when the environment variable is unset or
        blank.
    """
    env_url = os.environ.get("PDF_CONVERT_API_URL", "").strip()
    base = env_url if env_url else PDF_CONVERT_API_URL
    return _normalize_pdf_convert_api_url(str(base))


# Default storage path of the database.
# With sqlite, simply change DB_ROOT_PATH; for any other database, change
# SQLALCHEMY_DATABASE_URI directly.
DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}"

# Available vector store types and their configuration.
kbs_config = {
    "faiss": {
    },
    "milvus": {
        "host": "127.0.0.1",
        "port": "19530",
        "user": "",
        "password": "",
        "db_name": "default",
        "secure": False,
    },
    "zilliz": {
        "host": "127.0.0.1",
        "port": "19530",
        "user": "",
        "password": "",
        "secure": False,
    },
    "pg": {
        "connection_uri": "postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat",
    },
    "es": {
        "host": "127.0.0.1",
        "port": "9200",
        "index_name": "test_index",
        "user": "",
        "password": ""
    },
    "milvus_kwargs": {
        "search_params": {"metric_type": "L2"},  # add search_params here
        "index_params": {"metric_type": "L2", "index_type": "HNSW"}  # add index_params here
    },
    "chromadb": {}
}

# TextSplitter settings; do not modify them unless you understand what they
# mean.
text_splitter_dict = {
    "ChineseRecursiveParagraphSplitter": {
        "source": "huggingface",  # choose tiktoken to use OpenAI's method
        "tokenizer_name_or_path": "",
    },
    "ChineseRecursiveTextSplitter": {
        # Alternative (disabled): "source": "huggingface"
        "source": "no_tokenizer",  # choose tiktoken to use OpenAI's method
        "tokenizer_name_or_path": "",
    },
    "SpacyTextSplitter": {
        "source": "huggingface",
        "tokenizer_name_or_path": "gpt2",
    },
    "RecursiveCharacterTextSplitter": {
        "source": "tiktoken",
        "tokenizer_name_or_path": "cl100k_base",
    },
    # Alternative key (disabled): "GCYMarkdownTextSplitter"
    "MarkdownTextSplitter": {
        "headers_to_split_on": [
            ("#", "head1"),
            ("##", "head2"),
            ("###", "head3"),
            ("####", "head4"),
        ]
    },
}

TEXT_SPLITTER_MAP = {
    # Alternative key (disabled): "GCYMarkdownTextSplitter": ['.md'],
    "MarkdownTextSplitter": ['.md'],
    "ChineseRecursiveTextSplitter": ['.docx', '.doc', '.html'],
}

# Name of the TEXT_SPLITTER to use.
TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter"

# Vocabulary file of custom keywords for the embedding model.
EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt"

# Extend CH_BASE_NAME and EN_BASE_NAME with the metallurgy-series bases.
CH_BASE_NAME = CH_BASE_NAME + (
    YJ_CH_JOURNAL_BASE_NAME,
    YJ_NEWS_BASE_NAME,
    YJ_FOR_JOURNAL_BASE_NAME,
    YJ_OA_JOURNAL_BASE_NAME,
    YJ_POLICYS_BASE_NAME,
)
EN_BASE_NAME.extend([
    YJ_CH_JOURNAL_BASE,
    YJ_NEWS_BASE,
    YJ_FOR_JOURNAL_BASE,
    YJ_OA_JOURNAL_BASE,
    YJ_POLICYS_BASE,
])

# ********** China steel industry news base (new) **********
# Alternative (disabled): STEEL_KB = "steel_kb"
STEEL_KB = DEFAULT_KNOWLEDGE_BASE  # "steel_kb_token_chunk"
STEEL_KB_NAME = "中国钢铁行业动态库"

# Update the display-name tuple and the collection-name list.
CH_BASE_NAME = CH_BASE_NAME + (STEEL_KB_NAME,)
EN_BASE_NAME.append(STEEL_KB)