[全量] 初始化项目代码、配置、文档及Agent协同harness

This commit is contained in:
2026-04-02 11:36:05 +08:00
parent 0553309cdf
commit 87e571d9ec
1133 changed files with 221948 additions and 0 deletions

View File

@@ -0,0 +1,171 @@
import re
from typing import List
from pydantic import BaseModel, Field
from typing_extensions import Literal
from configs.kb_config import CHUNK_SIZE, OVERLAP_SIZE
# Minimal document container used by the splitter below: one text chunk plus
# the metadata describing where it came from.
class Document(BaseModel):
    # Text of this chunk.
    page_content: str
    # Free-form metadata; default_factory gives every instance its own empty dict.
    metadata: dict = Field(default_factory=dict)
    # Constant discriminator: only the literal value "Document" is accepted.
    type: Literal["Document"] = "Document"
class MarkdownTextSplitter:
    """Split a Markdown document into header-scoped, size-bounded chunks.

    The document is first cut into sections at Markdown headers (``#`` to
    ``######``); each section body is then cut into paragraphs of at most
    ``chunk_size`` characters, preferring sentence boundaries.
    """

    def __init__(self, headers_to_split_on: List[str] = None, **kwargs):
        """Create a splitter configured from ``configs.kb_config``.

        Args:
            headers_to_split_on: Header markers to remember on the instance;
                defaults to ``#`` .. ``####``. The value is stored only; the
                methods of this class do not read it.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        self.chunk_size = CHUNK_SIZE
        # Stored for completeness; no method of this class applies an overlap.
        self.overlap_size = OVERLAP_SIZE
        self.headers_to_split_on = headers_to_split_on or ["#", "##", "###", "####"]

    def clean_text(self, text: str) -> str:
        """Normalise whitespace and drop Markdown image links.

        Newlines, tabs and the literal two-character sequence ``\\n`` become
        spaces, the result is stripped, and ``![alt](path)`` image references
        are removed afterwards.
        """
        text = text.replace("\n", " ").replace("\t", " ").replace("\\n", " ").strip()
        # Remove Markdown image syntax such as ![](image_path).
        text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
        return text

    @staticmethod
    def _flush_section(sections, header, level, lines, has_header):
        """Append one ``(header, level, body)`` section to *sections*.

        Text that precedes the first header (``has_header`` False) is only
        kept when it is not blank, so leading empty lines create no section.
        """
        body = "\n".join(lines)
        if has_header or body.strip():
            sections.append((header, level, body))

    def split_text_by_headers(self, markdown_document: str):
        """Cut a Markdown document into sections at header lines.

        Returns:
            A list of ``(header_text, header_level, body)`` tuples in document
            order. Non-blank text before the first header is returned as a
            section with header ``""`` and level ``0`` instead of being lost,
            and a header whose title is empty still yields its own section.
        """
        header_re = re.compile(r"^(#{1,6})\s+(.*)$")  # any level from # to ######
        sections = []
        header, level, lines = "", 0, []
        has_header = False  # False while we are still in the pre-header preamble

        for line in markdown_document.split("\n"):
            match = header_re.match(line)
            if match is None:
                lines.append(self.clean_text(line))
                continue
            # A new header closes whatever section was being collected.
            self._flush_section(sections, header, level, lines, has_header)
            header = match.group(2).strip()
            level = len(match.group(1))  # "#" -> 1, "##" -> 2, ...
            lines = []
            has_header = True

        self._flush_section(sections, header, level, lines, has_header)
        return sections

    def split_paragraphs(self, content: str) -> List[str]:
        """Cut *content* into paragraphs of at most ``chunk_size`` characters.

        Sentences (terminated by ``。``, ``!``, ``?`` or their full-width forms)
        are packed greedily, joined by a single space. When the next sentence
        does not fit, the paragraph built so far is emitted before a new one
        is started. A single sentence longer than ``chunk_size`` is hard-cut
        into ``chunk_size`` slices.

        Raises:
            ValueError: if ``chunk_size`` is not positive (hard-cutting could
                never make progress).
        """
        limit = self.chunk_size
        if limit <= 0:
            raise ValueError("chunk_size must be a positive integer")

        # Splitting on a capturing group keeps each terminator as its own item,
        # so even indexes are sentence bodies and odd indexes are terminators.
        parts = re.split(r'([。!?!?])', content)
        sentences = []
        for i in range(0, len(parts), 2):
            terminator = parts[i + 1] if i + 1 < len(parts) else ""
            sentences.append(parts[i].strip() + terminator)

        paragraphs = []
        current = ""
        for sentence in sentences:
            sentence = self.clean_text(sentence)
            if not sentence:
                # Empty fragments (e.g. after the last terminator) add nothing.
                continue
            joined_len = len(current) + len(sentence) + (1 if current else 0)
            if joined_len <= limit:
                current = f"{current} {sentence}" if current else sentence
                continue
            # The sentence does not fit: emit what we have before starting anew.
            if current:
                paragraphs.append(current)
            while len(sentence) > limit:
                paragraphs.append(sentence[:limit])
                sentence = sentence[limit:]
            current = sentence

        if current:
            paragraphs.append(current)
        return paragraphs

    def split_documents(self, sections: List[tuple], doc_source) -> "List[Document]":
        """Turn ``(header, level, body)`` sections into ``Document`` chunks.

        Every chunk carries ``source`` and ``header`` metadata, plus an
        ``h1`` .. ``h6`` key matching the header level when the level is in
        that range. Each Document receives its own copy of the metadata so
        that editing one chunk's metadata cannot affect its siblings.
        """
        final_splits = []
        for header, header_level, content in sections:
            metadata = {"source": doc_source, "header": header}
            if 1 <= header_level <= 6:
                metadata[f"h{header_level}"] = header
            for paragraph in self.split_paragraphs(content):
                final_splits.append(
                    Document(page_content=paragraph, metadata=dict(metadata))
                )
        return final_splits

    def split_markdown_text(self, markdown_document: str, doc_source: str) -> "List[Document]":
        """Split a whole Markdown document: sections first, then sized chunks."""
        sections = self.split_text_by_headers(markdown_document)
        return self.split_documents(sections, doc_source)
# Usage example: split a tiny three-level Markdown sample and print every chunk.
if __name__ == "__main__":
    markdown_text = """
# 标题 1
QQQ
## 标题 2
WWW
### 标题 3
EEE
"""
    doc_source = ""
    md_splitter = MarkdownTextSplitter()
    for split in md_splitter.split_markdown_text(markdown_text, doc_source):
        print(f"Header: {split.metadata}, Content: {split.page_content}")

View File

@@ -0,0 +1,7 @@
from .chinese_text_splitter import ChineseTextSplitter
from .ali_text_splitter import AliTextSplitter
from .zh_title_enhance import zh_title_enhance
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
from .chinese_recursive_paragraph_splitter import ChineseRecursiveParagraphSplitter
from .GCYMarkdownTextSplitter import GCYMarkdownTextSplitter
from .MarkdownTextSplitter import MarkdownTextSplitter

View File

@@ -0,0 +1,34 @@
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List
class AliTextSplitter(CharacterTextSplitter):
    """Split text with the DAMO document-segmentation model from ModelScope.

    The model used is ``damo/nlp_bert_document-segmentation_chinese-base``
    (paper: https://arxiv.org/abs/2107.09278). It requires the optional
    ``modelscope[nlp]`` package, e.g.
    ``pip install "modelscope[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html``.
    The pipeline is placed on the CPU; change ``device`` below to use a GPU.
    """

    def __init__(self, pdf: bool = False, **kwargs):
        """Create the splitter.

        Args:
            pdf: When True, whitespace is normalised before segmentation
                (intended for text extracted from PDF files).
            **kwargs: Forwarded to ``CharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self.pdf = pdf
        # Built lazily on first use so the model is loaded at most once.
        self._segmentation_pipeline = None

    def split_text(self, text: str) -> List[str]:
        """Return the semantic segments of *text*.

        Raises:
            ImportError: if the ``modelscope`` package is not installed.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            # Raw string: "\s" in a normal literal is an invalid escape sequence.
            text = re.sub(r"\s", " ", text)
            text = re.sub("\n\n", "", text)
        try:
            from modelscope.pipelines import pipeline
        except ImportError as err:
            raise ImportError(
                "Could not import modelscope python package. "
                "Please install modelscope with `pip install modelscope`. "
            ) from err

        segmenter = getattr(self, "_segmentation_pipeline", None)
        if segmenter is None:
            # Loading the model is expensive; keep it for later calls.
            segmenter = pipeline(
                task="document-segmentation",
                model='damo/nlp_bert_document-segmentation_chinese-base',
                device="cpu")
            self._segmentation_pipeline = segmenter

        result = segmenter(documents=text)
        # The model output separates segments with "\n\t"; drop empty pieces.
        return [segment for segment in result["text"].split("\n\t") if segment]

View File

@@ -0,0 +1,88 @@
import re
from typing import List, Optional, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
logger = logging.getLogger(__name__)
def _split_text_with_regex_from_end(
text: str, separator: str, keep_separator: bool
) -> List[str]:
# Now that we have the separator, split the text
if separator:
if keep_separator:
# The parentheses in the pattern keep the delimiters in the result.
_splits = re.split(f"({separator})", text)
splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
if len(_splits) % 2 == 1:
splits += _splits[-1:]
# splits = [_splits[0]] + splits
else:
splits = re.split(separator, text)
else:
splits = list(text)
return [s for s in splits if s != ""]
class ChineseRecursiveParagraphSplitter(RecursiveCharacterTextSplitter):
    """Paragraph splitter: one cut on the first line-break separator found.

    Unlike its parent, ``_split_text`` here neither recurses nor merges
    pieces; it splits once on the first separator present in the text.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        is_separator_regex: bool = True,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter."""
        super().__init__(keep_separator=keep_separator, **kwargs)
        default_separators = [
            "\n\n",
            "\n",
            '\r\n',
            '\r'
        ]
        self._separators = separators or default_separators
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks."""

        def as_pattern(raw: str) -> str:
            # Separators are regexes already unless the splitter says otherwise.
            return raw if self._is_separator_regex else re.escape(raw)

        # Fall back to the last separator when none of them occurs in the text.
        chosen = separators[-1]
        for candidate in separators:
            if candidate == "" or re.search(as_pattern(candidate), text):
                chosen = candidate
                break

        pieces = _split_text_with_regex_from_end(
            text, as_pattern(chosen), self._keep_separator
        )

        cleaned = []
        for piece in pieces:
            stripped = piece.strip()
            if stripped != "":
                # Collapse runs of blank lines left inside a piece.
                cleaned.append(re.sub(r"\n{2,}", "\n", stripped))
        return cleaned
if __name__ == "__main__":
    # Manual smoke test: load an OCR'd CSV through the project's loader and
    # print the length of every chunk the splitter produces.
    text_splitter = ChineseRecursiveParagraphSplitter(
        chunk_size=1,
        chunk_overlap=0,
        keep_separator=True,
        is_separator_regex=True,
    )

    import sys

    # Make the project root importable before pulling in document_loaders.
    sys.path.append('../../../GCY-RAG-LangChain-ChatChat/')
    import document_loaders

    filepath = "/home/work/project/test_result.csv"
    loader = document_loaders.RapidOCRCSVLoader(filepath, autodetect_encoding=True)
    for inum, doc in enumerate(loader.load()):
        print(inum)
        for idx, chunk in enumerate(text_splitter.split_text(doc.page_content)):
            print(f'///////////////////////// idx:{idx} //////////////////////////')
            print(len(chunk))

View File

@@ -0,0 +1,127 @@
import re
from typing import List, Optional, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
logger = logging.getLogger(__name__)
def _split_text_with_regex_from_end(
text: str, separator: str, keep_separator: bool
) -> List[str]:
# Now that we have the separator, split the text
if separator:
if keep_separator:
# The parentheses in the pattern keep the delimiters in the result.
_splits = re.split(f"({separator})", text)
splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
if len(_splits) % 2 == 1:
splits += _splits[-1:]
# splits = [_splits[0]] + splits
else:
splits = re.split(separator, text)
else:
splits = list(text)
return [s for s in splits if s != ""]
class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive splitter with separators suited to Chinese and English text.

    Text is split on the coarsest separator that occurs in it (paragraph
    break, line break, sentence end, clause end); pieces that are still too
    long are split again with the remaining, finer separators, and short
    pieces are merged back up to the configured chunk size.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        is_separator_regex: bool = True,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separators: Separator patterns ordered from coarse to fine;
                defaults to paragraph, line, sentence and clause boundaries.
            keep_separator: Keep each separator attached to the preceding piece.
            is_separator_regex: Treat separators as regular expressions.
            **kwargs: Forwarded to ``RecursiveCharacterTextSplitter``.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        # Every alternative must be non-empty: an empty alternative ("a||b")
        # matches the empty string, which makes re.search always succeed and
        # re.split cut the text between every single character.
        self._separators = separators or [
            "\n\n",
            "\n",
            "。|!|?",
            r"\.\s|\!\s|\?\s",
            r";|;\s",
            r",|,\s",
        ]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks."""
        final_chunks = []
        # Pick the first separator that occurs in the text; fall back to the last.
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                # Finer separators remain available for oversized pieces.
                new_separators = separators[i + 1:]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex_from_end(text, _separator, self._keep_separator)

        # Merge short pieces and recursively split the ones that are too long.
        _good_splits = []
        # Kept separators are already inside the pieces, so merge without glue.
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
                continue
            if _good_splits:
                final_chunks.extend(self._merge_splits(_good_splits, _separator))
                _good_splits = []
            if not new_separators:
                # Nothing finer to split on: keep the oversized piece as is.
                final_chunks.append(s)
            else:
                final_chunks.extend(self._split_text(s, new_separators))
        if _good_splits:
            final_chunks.extend(self._merge_splits(_good_splits, _separator))
        return [
            re.sub(r"\n{2,}", "\n", chunk.strip())
            for chunk in final_chunks
            if chunk.strip() != ""
        ]
if __name__ == "__main__":
    # Manual demo: split two sample passages and print every resulting chunk.
    text_splitter = ChineseRecursiveTextSplitter(
        chunk_size=100,
        chunk_overlap=0,
        keep_separator=True,
        is_separator_regex=True,
    )
    samples = [
        """中国对外贸易形势报告75页
前 10 个月,一般贸易进出口 19.5 万亿元,增长 25.1% 比整体进出口增速高出 2.9 个百分点,
占进出口总额的 61.7%,较去年同期提升 1.6 个百分点。其中,一般贸易出口 10.6 万亿元,增长 25.3%
占出口总额的 60.9%,提升 1.5 个百分点进口8.9万亿元增长24.9%占进口总额的62.7% 提升 1.8 个百分点。
加工贸易进出口 6.8 万亿元,增长 11.8% 占进出口总额的 21.5%,减少 2.0 个百分点。其中,出口增 长 10.4%
占出口总额的 24.3%,减少 2.6 个百分点;进口增 长 14.2%,占进口总额的 18.0%,减少 1.2 个百分点。
此外, 以保税物流方式进出口 3.96 万亿元,增长 27.9%
其中,出 口 1.47 万亿元,增长 38.9%;进口 2.49 万亿元,增长 22.2%
前三季度,中国服务贸易继续保持快速增长态势。服务 进出口总额 37834.3 亿元,增长 11.6%
其中服务出口 17820.9 亿元,增长 27.3%;进口 20013.4 亿元,增长 0.5%,进口增 速实现了疫情以来的首次转正。
服务出口增幅大于进口 26.8 个百分点,带动服务贸易逆差下降 62.9%至 2192.5 亿元。
""",
        """
服 务贸易结构持续优化,知识密集型服务进出口 16917.7 亿元, 增长 13.3%,占服务进出口总额的比重达到 44.7%
提升 0.7 个百分点。 二、中国对外贸易发展环境分析和展望 全球疫情起伏反复,经济复苏分化加剧,
大宗商品价格 上涨、能源紧缺、运力紧张及发达经济体政策调整外溢等风 险交织叠加。同时也要看到,
我国经济长期向好的趋势没有 改变,外贸企业韧性和活力不断增强,新业态新模式加快发 展,创新转型步伐提速。
产业链供应链面临挑战。美欧等加快出台制造业回迁计 划,加速产业链供应链本土布局,跨国公司调整产业链供应 链,
全球双链面临新一轮重构,区域化、近岸化、本土化、 短链化趋势凸显。疫苗供应不足,制造业“缺芯”、
物流受限、 运价高企,全球产业链供应链面临压力。 全球通胀持续高位运行。能源价格上涨加大主要经济体 的通胀压力,
增加全球经济复苏的不确定性。世界银行今年 10 月发布《大宗商品市场展望》指出,能源价格在 2021 年 大涨逾 80%
并且仍将在 2022 年小幅上涨。IMF 指出,全 球通胀上行风险加剧,通胀前景存在巨大不确定性。
""",
    ]
    for inum, text in enumerate(samples):
        print(inum)
        for idx, chunk in enumerate(text_splitter.split_text(text)):
            print(f'!!!!!!!!!!!!!!!!!!!!!idx:{idx}')
            print(chunk)

View File

@@ -0,0 +1,59 @@
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List
class ChineseTextSplitter(CharacterTextSplitter):
    """Rule-based sentence splitter for Chinese (and mixed) text."""

    # Progressively finer split rules applied, in order, to pieces that are
    # still longer than ``sentence_size``: after a comma or period, after a
    # run of two or more spaces, and finally after any single space.
    _FALLBACK_PATTERNS = (
        re.compile(r'([,,.]["’”」』]{0,2})([^,,.])'),
        re.compile(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])'),
        re.compile('( ["’”」』]{0,2})([^ ])'),
    )

    def __init__(self, pdf: bool = False, sentence_size: int = 250, **kwargs):
        """Create the splitter.

        Args:
            pdf: When True, whitespace is normalised first (intended for text
                extracted from PDF files).
            sentence_size: Length above which ``split_text`` keeps splitting a
                sentence with finer rules.
            **kwargs: Forwarded to ``CharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    def split_text1(self, text: str) -> List[str]:
        """Split *text* into sentences with a single separator pattern.

        A terminator (plus up to two closing quotes) is appended to the
        sentence that precedes it; an opening quote starts a new sentence.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub(r'\s', ' ', text)
            text = text.replace("\n\n", "")
        sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            if sent_sep_pattern.match(ele) and sent_list:
                # A separator belongs to the sentence it terminates.
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list

    def _split_oversized(self, piece: str, patterns) -> List[str]:
        """Split *piece* with *patterns* until it fits ``sentence_size``.

        Pieces that already fit, or for which no finer rule is left, are
        returned unchanged; empty pieces are dropped.
        """
        if len(piece) <= self.sentence_size or not patterns:
            return [piece] if piece else []
        pattern, remaining = patterns[0], patterns[1:]
        result = []
        for part in pattern.sub(r'\1\n\2', piece).split("\n"):
            result.extend(self._split_oversized(part, remaining))
        return result

    def split_text(self, text: str) -> List[str]:
        """Split *text* into sentences no longer than ``sentence_size`` where possible.

        Sentence boundaries are marked by inserting ``\\n`` after terminators.
        When a terminator is followed by a closing quote, the quote ends the
        sentence, so the break is placed after the quote. Sentences that are
        still too long are split further at commas, then at whitespace.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            text = re.sub(r"\s", " ", text)
            text = re.sub("\n\n", "", text)
        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # single-character terminators
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # English ellipsis "......"
        text = re.sub(r'(…{2})([^"’”」』])', r"\1\n\2", text)  # Chinese ellipsis "……"
        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
        text = text.rstrip()  # drop any trailing break at the end of the text

        sentences = []
        for sentence in text.split("\n"):
            if sentence:
                sentences.extend(self._split_oversized(sentence, self._FALLBACK_PATTERNS))
        return sentences

View File

@@ -0,0 +1,99 @@
from langchain.docstore.document import Document
import re
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
    """Return True when too few of the visible characters are alphabetic.

    This helps prevent text like "-----------BREAK---------" from being tagged
    as a title or narrative text. Whitespace is not counted.

    Parameters
    ----------
    text
        The input string to test
    threshold
        If the share of alphabetic characters among the non-whitespace
        characters is below this threshold, the function returns True

    Returns False for empty or whitespace-only text.
    """
    visible = [char for char in text if char.strip()]
    if not visible:
        # Nothing to measure; also avoids dividing by zero.
        return False
    alpha_count = sum(1 for char in visible if char.isalpha())
    return alpha_count / len(visible) < threshold


def is_possible_title(
    text: str,
    title_max_word_length: int = 20,
    non_alpha_threshold: float = 0.5,
) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.

    Parameters
    ----------
    text
        The input text to check
    title_max_word_length
        The maximum length of a title, measured in characters
    non_alpha_threshold
        The minimum share of alphabetic characters the text needs to be
        considered a title
    """
    # Empty text is never a title.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    # Text that ends in a punctuation mark is not a title.
    if re.search(r"[^\w\s]\Z", text) is not None:
        return False

    # The text must not be longer than the configured maximum (20 by default).
    if len(text) > title_max_word_length:
        return False

    # Too few alphabetic characters (e.g. mostly digits or symbols).
    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
        return False

    # Prevent flagging salutations like "To My Dearest Friends," as titles.
    # Every suffix must be non-empty: str.endswith("") is always True and
    # would make this function reject every input.
    if text.endswith((",", ".", ",", "。")):
        return False

    if text.isnumeric():
        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
        return False

    # A title must contain a digit within its first five characters.
    return any(char.isnumeric() for char in text[:5])
def zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Tag title-like documents and prefix later documents with their title.

    A document whose text looks like a title gets ``metadata['category']``
    set to ``'cn_Title'``. Each following non-title document has a sentence
    naming the most recent title prepended to its ``page_content``. The
    documents are modified in place and the same list is returned.

    When *docs* is empty a message is printed and *docs* is returned
    unchanged, so callers always get back what they passed in rather than
    ``None``.
    """
    if not docs:
        print("文件不存在")
        return docs

    title = None
    for doc in docs:
        if is_possible_title(doc.page_content):
            doc.metadata['category'] = 'cn_Title'
            title = doc.page_content
        elif title:
            doc.page_content = f"下文与({title})有关。{doc.page_content}"
    return docs