Files
gangyan/langchain-chat/configs/kb_config.py

307 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from pathlib import Path
import re
# Knowledge base used by default. Every alias below resolves to this id.
# (A commented-out earlier value in this file was "t_policy_total_bge_v1".)
DEFAULT_KNOWLEDGE_BASE = "t_policy_total_bge_new_v2"

# Name pattern of personal knowledge bases: any name starting with "p_".
SELF_KNOWLEDGE_BASE = re.compile(r"^p_.*")

# Each *_BASE alias points at DEFAULT_KNOWLEDGE_BASE; the quoted id in its
# trailing comment is the collection name the original source kept next to it
# (presumably the dedicated collection it used to target -- confirm).
# Each *_BASE_NAME is the Chinese display name paired with the alias.
DEFAULT_POLICY_BASE = DEFAULT_KNOWLEDGE_BASE  # "t_policy_total_bge_new_v2"
DEFAULT_POLICY_BASE_NAME = "政策库"

DEFAULT_REPORT_BASE1 = DEFAULT_KNOWLEDGE_BASE  # "t_strategy_report_bge_v2"
DEFAULT_REPORT_BASE = DEFAULT_KNOWLEDGE_BASE  # "gydemo_report_v2"
DEFAULT_REPORT_BASE_NAME = "报告库"

DEFAULT_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "t_journal_article_bge_v1"
DEFAULT_JOURNAL_BASE_NAME = "期刊论文库"

GY_NEWS_BASE = DEFAULT_KNOWLEDGE_BASE  # "gydemo_news_v2"
GY_NEWS_BASE_NAME = "冶金行业新闻库"

GY_REPORT_BASE = DEFAULT_KNOWLEDGE_BASE  # "gydemo_report_v2"
GY_REPORT_BASE_NAME = "冶金行业报告库"

GY_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "gy_demo_journal_v3"
GY_JOURNAL_BASE_NAME = "冶金专业知识库"

# Older policy collection ids.
OLD_POLICY_BASE = [
    "t_policy_total_bge_v1",
    "t_policy_total_bge_new_v1",
]

# Default vector store / full-text search engine type.
# Options: faiss, milvus (offline), zilliz (online), pgvector, chromadb;
# full-text search engine: es.
DEFAULT_VS_TYPE = "milvus"
# Newly added constants for the metallurgy (YJ) family of knowledge bases.
# Every id currently resolves to DEFAULT_KNOWLEDGE_BASE; the quoted id in the
# trailing comment is the collection name the original source kept next to it.
YJ_CH_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_ch_journal_bge_v1_recover"
YJ_CH_JOURNAL_BASE_NAME = "冶金中文期刊库"

YJ_NEWS_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_news_bge_v1_recover"
YJ_NEWS_BASE_NAME = "冶金新闻库2024年以及之前"

YJ_FOR_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_for_journal_bge_v1_recover"
YJ_FOR_JOURNAL_BASE_NAME = "冶金外文期刊库"

YJ_OA_JOURNAL_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_oa_journal_bge_v2_recover"
YJ_OA_JOURNAL_BASE_NAME = "冶金OA期刊库"

YJ_POLICYS_BASE = DEFAULT_KNOWLEDGE_BASE  # "yj_policys_bge_v1_recover"
YJ_POLICYS_BASE_NAME = "冶金政策库"

# Further dedicated metallurgy news / journal / report bases can be appended
# here when needed.
YJ_BASE_NAME = [
    YJ_CH_JOURNAL_BASE_NAME,
    YJ_NEWS_BASE_NAME,
    YJ_FOR_JOURNAL_BASE_NAME,
    YJ_OA_JOURNAL_BASE_NAME,
    YJ_POLICYS_BASE_NAME,
]
# Display names (the *_NAME constants) of the base knowledge bases.
# Kept as a tuple: later code in this file grows it by concatenation.
CH_BASE_NAME = (
    DEFAULT_POLICY_BASE_NAME,
    DEFAULT_JOURNAL_BASE_NAME,
    GY_NEWS_BASE_NAME,
    GY_REPORT_BASE_NAME,
    GY_JOURNAL_BASE_NAME,
    DEFAULT_REPORT_BASE_NAME,
)

# Collection ids (the *_BASE constants).
# Kept as a list: later code in this file mutates it in place.
EN_BASE_NAME = [
    DEFAULT_POLICY_BASE,
    DEFAULT_REPORT_BASE,
    DEFAULT_REPORT_BASE1,
    DEFAULT_JOURNAL_BASE,
    GY_NEWS_BASE,
    GY_REPORT_BASE,
    GY_JOURNAL_BASE,
    YJ_CH_JOURNAL_BASE,
    YJ_NEWS_BASE,
    YJ_FOR_JOURNAL_BASE,
    YJ_OA_JOURNAL_BASE,
    YJ_POLICYS_BASE,
]

# Collection ids of the GY_* knowledge bases.
GY_BASE_NAME = [
    GY_NEWS_BASE,
    GY_REPORT_BASE,
    GY_JOURNAL_BASE,
]

# Older journal collection ids.
OLD_JOURNAL_BASE = ["t_journal_article_bge_v0"]
# --- FAISS caches ---------------------------------------------------------
# Number of vector stores kept in cache (applies to FAISS).
CACHED_VS_NUM = 1
# Number of temporary vector stores kept in cache (applies to FAISS);
# used for file chat.
CACHED_MEMO_VS_NUM = 10

# --- Chunking -------------------------------------------------------------
# Length of a single text chunk in the knowledge base
# (does not apply to MarkdownHeaderTextSplitter).
CHUNK_SIZE = 250
# Overlap length between adjacent chunks
# (does not apply to MarkdownHeaderTextSplitter).
OVERLAP_SIZE = 50

# --- Retrieval ------------------------------------------------------------
# Number of vectors matched per knowledge-base query.
VECTOR_SEARCH_TOP_K = 5
# Distance threshold for knowledge-base matching. Values usually lie in 0-1;
# a smaller score means a smaller distance and therefore higher relevance.
# Some users reported match scores above 1, so for compatibility the default
# is 1; the WebUI allows adjusting it within 0-2.
SCORE_THRESHOLD = 1.0
# Added by zsj: similarity threshold for duplicate documents, in 0-1;
# the larger the value, the more similar two documents must be.
DUPLICATE_THRESHOLD = 0.98
# Default search engine. Options: bing, duckduckgo, metaphor.
# Use duckduckgo while no KGO search service is deployed locally; switch to
# kgo once a self-hosted search service is available.
DEFAULT_SEARCH_ENGINE = "duckduckgo"

# KGO search endpoints (adjust the port if a KGO search service is deployed).
kgo_search_url = "http://192.168.203.21:8326/search/search"
kgo_professional_search_url = "http://192.168.203.21:8326/search/professionalSearch"

# Drawing (image generation) endpoints.
realistic_url = "http://127.0.0.1:5000/generate"
ink_url = "http://127.0.0.1:5000/generate-image"
# MySQL connection settings.
# NOTE(review): the password is committed in source; consider loading it from
# the environment or a secrets store instead.
ck_mysql_config = {
    "host": "127.0.0.1",
    "port": 33306,
    "user": "root",
    "password": "1234567890",
    "database": "chat_gpt_yj",
    "charset": "utf8mb4",
}
# Number of results taken from the search engine.
SEARCH_ENGINE_TOP_K = 3

DOWNLOAD_HOST_CK = "http://127.0.0.1:8099/chat_web_backend"

# "knowledge_base" directory two levels above this file (resolved, as a Path).
KB_ROOT_PATH2 = Path(__file__).resolve().parent.parent.joinpath("knowledge_base")

# Relevance-comparison endpoint.
similarity_url = "http://127.0.0.1:5000/similar"
# Relevance threshold for knowledge-base search.
similarity_score = 0.4
# Relevance threshold for internet search.
similarity_internet = 0.5
# --- Bing search ----------------------------------------------------------
# Bing search requires a Bing Subscription Key, which has to be requested in
# the Azure portal (Bing Search trial). How to apply:
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource
# Creating a Bing API search instance with Python:
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/quickstarts/rest/python
BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"

# Note: this is NOT the Bing Webmaster Tools API key.
# If the server reports
#   "Failed to establish a new connection: [Errno 110] Connection timed out"
# the server is behind a firewall and an administrator has to whitelist the
# endpoint (on a company server that may not be possible).
BING_SUBSCRIPTION_KEY = ""

# --- Other providers ------------------------------------------------------
# NOTE(review): the two keys below are committed in source; consider loading
# them from the environment and rotating them.
# Metaphor search requires a key.
METAPHOR_API_KEY = "e09d3cdd-e7e1-41d7-a419-6b298002d921"
# Seniverse weather API key, used by the weather Agent.
# Apply at https://www.seniverse.com/
SENIVERSE_API_KEY = "STNmmw0iUKB96PNpJ"
# Whether to enable Chinese title enhancement (and its related settings).
# Title detection decides which pieces of text are titles and marks them in
# the metadata; each text is then joined with its parent title to enrich it.
ZH_TITLE_ENHANCE = False

# PDF OCR control: only run OCR on images whose width and height exceed the
# given fraction of the page (image width / page width,
# image height / page height). This avoids interference from small images in
# a PDF and speeds up processing of non-scanned PDFs.
PDF_OCR_THRESHOLD = (0.6, 0.6)

# Introduction text per knowledge base, shown when the knowledge base is
# initialised and used for Agent calls. A knowledge base without an entry has
# no introduction and will not be called by the Agent.
KB_INFO = {
    "知识库名称": "知识库介绍",
    "samples": "关于本项目issue的解答",
}
# --- Personal knowledge base settings --------------------------------------
SELF_SCORE_THRESHOLD = 1.9
SELF_TOP_K = 5
SELF_TEMPERATURE = 0.3
SELF_MAX_TOKENS = 8192
# No reranker is needed with Milvus: Milvus already scores and orders the
# recalled results.
SELF_USE_RERANKER = False

# --- Filesystem locations and image serving --------------------------------
GENERATED_IMAGES_BASE_PATH = "/home/albert/workspaces/modelSpaces/models/text_to_pic/generated_images"
IMAGE_SERVER_URL_TEMPLATE = "http://127.0.0.1:8099/chat_web_backend/get-image?file_name={}"
KB_CHAT_TEMP_DIR = "/home/albert/workspaces/modelSpaces/models/tmp"
# Location of the Chrome browser, used for scraping page data.
CHROME_DIR = "/home/albert/workspaces/modelSpaces/models/chrome"
# Usually nothing below this point needs to be changed.
# Default storage root of the knowledge bases: the "knowledge_base" directory
# next to the package directory that contains this file.
KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")
# Create the directory at import time. exist_ok=True removes the
# check-then-create race of "if not exists: mkdir" (two processes importing
# this module at once could both pass the check and one would crash).
# Unlike the old check, this raises FileExistsError when the path exists but
# is not a directory, surfacing that misconfiguration immediately.
os.makedirs(KB_ROOT_PATH, exist_ok=True)
# PDF -> Markdown external microservice (file_converter.pdf_to_html).
# Clients must never request http://0.0.0.0:...: 0.0.0.0 is only meaningful as
# a bind address. Used as a request URL it yields an abnormal Host and the
# server commonly answers 403.
def _normalize_pdf_convert_api_url(raw: str) -> str:
    """Return a client-usable URL for the PDF conversion service.

    Args:
        raw: Configured URL. ``None``, empty and whitespace-only values fall
            back to ``http://127.0.0.1:6006/convert/``.

    Returns:
        The stripped URL with a wildcard host ``0.0.0.0`` rewritten to
        ``127.0.0.1``. Only the host position is rewritten (directly after an
        optional ``scheme://`` and optional ``userinfo@``); hosts that merely
        contain that text (e.g. ``10.0.0.0``) and occurrences inside the path
        or query string are left untouched.
    """
    url = (raw or "").strip() or "http://127.0.0.1:6006/convert/"
    # A plain str.replace("0.0.0.0", ...) also rewrites "10.0.0.0" into
    # "1127.0.0.1". Anchor the match to the start of the authority and require
    # a delimiter (port, path, query, fragment) or end of string after it.
    return re.sub(
        r"^((?:[A-Za-z][A-Za-z0-9+.-]*://)?(?:[^/?#@]*@)?)0\.0\.0\.0(?=[:/?#]|$)",
        r"\g<1>127.0.0.1",
        url,
        count=1,
    )
# Conversion service URL, taken from the PDF_CONVERT_API_URL environment
# variable (with a local default) and normalised once at import time.
PDF_CONVERT_API_URL = _normalize_pdf_convert_api_url(
    os.environ.get("PDF_CONVERT_API_URL", "http://127.0.0.1:6006/convert/")
)

# The pdf_path sent to the conversion service is relative to this directory.
# It must match the knowledge-base root that the microservice process can
# read; when they differ, set the PDF_CONVERT_KB_ROOT environment variable.
PDF_CONVERT_KB_ROOT = os.path.abspath(
    os.environ.get("PDF_CONVERT_KB_ROOT", KB_ROOT_PATH)
)
def get_pdf_convert_api_url() -> str:
    """Return the conversion service URL to use for the next HTTP request.

    Call this right before issuing a request instead of relying only on the
    module-level ``PDF_CONVERT_API_URL``. The ``PDF_CONVERT_API_URL``
    environment variable of the current process takes precedence, so that a
    child process or a stale ``.pyc`` still caching ``http://0.0.0.0:6006``
    does not win.

    Returns:
        The normalised URL: the stripped environment value when it is
        non-empty, otherwise the module-level ``PDF_CONVERT_API_URL``.
    """
    override = os.environ.get("PDF_CONVERT_API_URL", "").strip()
    if override:
        return _normalize_pdf_convert_api_url(str(override))
    return _normalize_pdf_convert_api_url(str(PDF_CONVERT_API_URL))
# Default database location.
# With sqlite, changing DB_ROOT_PATH is enough; for any other database edit
# SQLALCHEMY_DATABASE_URI directly.
DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
SQLALCHEMY_DATABASE_URI = "sqlite:///" + DB_ROOT_PATH
# Available vector store types and the settings of each.
# NOTE(review): the "pg" connection URI embeds a username and password.
kbs_config = {
    "faiss": {},
    "milvus": {
        "host": "127.0.0.1",
        "port": "19530",
        "user": "",
        "password": "",
        "db_name": "default",
        "secure": False,
    },
    "zilliz": {
        "host": "127.0.0.1",
        "port": "19530",
        "user": "",
        "password": "",
        "secure": False,
    },
    "pg": {
        "connection_uri": "postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat",
    },
    "es": {
        "host": "127.0.0.1",
        "port": "9200",
        "index_name": "test_index",
        "user": "",
        "password": "",
    },
    "milvus_kwargs": {
        # Add search_params here.
        "search_params": {"metric_type": "L2"},
        # Add index_params here.
        "index_params": {"metric_type": "L2", "index_type": "HNSW"},
    },
    "chromadb": {},
}
# TextSplitter settings. Do not change them unless you understand what they
# mean. For "source", choosing tiktoken uses OpenAI's method.
text_splitter_dict = {
    "ChineseRecursiveParagraphSplitter": {
        "source": "huggingface",
        "tokenizer_name_or_path": "",
    },
    "ChineseRecursiveTextSplitter": {
        # A commented-out alternative in the original was "huggingface".
        "source": "no_tokenizer",
        "tokenizer_name_or_path": "",
    },
    "SpacyTextSplitter": {
        "source": "huggingface",
        "tokenizer_name_or_path": "gpt2",
    },
    "RecursiveCharacterTextSplitter": {
        "source": "tiktoken",
        "tokenizer_name_or_path": "cl100k_base",
    },
    # A commented-out alternative key in the original was
    # "GCYMarkdownTextSplitter".
    "MarkdownTextSplitter": {
        "headers_to_split_on": [
            ("#", "head1"),
            ("##", "head2"),
            ("###", "head3"),
            ("####", "head4"),
        ],
    },
}
# File extensions associated with each text splitter.
# (A commented-out alternative key in the original was
# "GCYMarkdownTextSplitter" for ".md".)
TEXT_SPLITTER_MAP = {
    "MarkdownTextSplitter": [".md"],
    "ChineseRecursiveTextSplitter": [".docx", ".doc", ".html"],
}

# Name of the text splitter (TEXT_SPLITTER).
TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter"

# Vocabulary file holding custom words for the embedding model.
EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt"
# Now that the newer knowledge-base constants are defined, extend
# CH_BASE_NAME (tuple, rebound to a longer tuple) and EN_BASE_NAME (list,
# grown in place) with the metallurgy (YJ) entries.
CH_BASE_NAME += (
    YJ_CH_JOURNAL_BASE_NAME,
    YJ_NEWS_BASE_NAME,
    YJ_FOR_JOURNAL_BASE_NAME,
    YJ_OA_JOURNAL_BASE_NAME,
    YJ_POLICYS_BASE_NAME,
)

# NOTE(review): the initial EN_BASE_NAME list in this file already contains
# these five ids, so they end up in the list twice. Kept as-is because it is
# not visible here whether callers depend on the list's length or order --
# confirm before de-duplicating.
EN_BASE_NAME += [
    YJ_CH_JOURNAL_BASE,
    YJ_NEWS_BASE,
    YJ_FOR_JOURNAL_BASE,
    YJ_OA_JOURNAL_BASE,
    YJ_POLICYS_BASE,
]
# ********** China steel industry news knowledge base (new) **********
# Currently resolves to DEFAULT_KNOWLEDGE_BASE. Ids recorded in the original
# comments: "steel_kb" (commented-out assignment) and "steel_kb_token_chunk".
STEEL_KB = DEFAULT_KNOWLEDGE_BASE
STEEL_KB_NAME = "中国钢铁行业动态库"

# Register it in the Chinese-name tuple and the English-name list.
CH_BASE_NAME += (STEEL_KB_NAME,)
EN_BASE_NAME += [STEEL_KB]