[全量] 初始化项目代码、配置、文档及Agent协同harness

2026-04-02 11:36:05 +08:00
parent 0553309cdf
commit 87e571d9ec
1133 changed files with 221948 additions and 0 deletions
--- a/langchain-chat/text_splitter/GCYMarkdownTextSplitter.cpython-311-x86_64-linux-gnu.so
+++ b/langchain-chat/text_splitter/GCYMarkdownTextSplitter.cpython-311-x86_64-linux-gnu.so
--- a/langchain-chat/text_splitter/MarkdownTextSplitter.py
+++ b/langchain-chat/text_splitter/MarkdownTextSplitter.py
@@ -0,0 +1,171 @@
+import re
+from typing import List
+from pydantic import BaseModel, Field
+from typing_extensions import Literal
+
+from configs.kb_config import CHUNK_SIZE, OVERLAP_SIZE
+
+
+class Document(BaseModel):
+    # Container for one chunk: its text plus free-form metadata.
+
+    # Text of the chunk.
+    page_content: str
+    # Arbitrary metadata; each instance gets its own fresh empty dict by default.
+    metadata: dict = Field(default_factory=dict)
+    # Fixed tag; the only accepted value is the literal "Document".
+    type: Literal["Document"] = "Document"
+
+
+class MarkdownTextSplitter:
+    def __init__(self, headers_to_split_on: List[str] = None, **kwargs):
+        self.chunk_size = CHUNK_SIZE
+        self.overlap_size = OVERLAP_SIZE
+        self.headers_to_split_on = headers_to_split_on or ["#", "##", "###", "####"]
+
+    def clean_text(self, text: str) -> str:
+        """
+        清理文本中的特殊符号，如 \n、\t、\\n 等，及图片格式链接，如 ![](image_path)
+        """
+        # 去除 \n、\t、\\n 等多余的特殊符号
+        text = text.replace("\n", " ").replace("\t", " ").replace("\\n", " ").strip()
+
+        # 正则匹配 Markdown 图片格式并清除，例如 ![](image_path)
+        text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
+
+        return text
+    def split_text_by_headers(self, markdown_document: str):
+        """
+        使用正则表达式将 Markdown 文档根据标题分段
+        """
+        header_pattern = r"^(#{1,6})\s+(.*)$"  # 捕获所有级别的标题（从 # 到 ######）
+        sections = []
+        current_header = None
+        current_content = []
+        current_header_level = 0
+
+        # 按行处理 markdown 文档
+        for line in markdown_document.split("\n"):
+            match = re.match(header_pattern, line)
+            if match:
+                # 如果找到标题，处理之前的部分
+                if current_header:
+                    sections.append((current_header, current_header_level, "\n".join(current_content)))
+                # 更新标题和内容
+                current_header = match.group(2).strip()  # 标题内容
+                current_header_level = len(match.group(1))  # 标题级别，# 代表 h1, ## 代表 h2 等
+                current_content = []
+            else:
+                # 否则将该行加入当前内容
+                current_content.append(self.clean_text(line))
+
+        # 添加最后一个部分
+        if current_header:
+            sections.append((current_header, current_header_level, "\n".join(current_content)))
+        
+        return sections
+
+    def split_paragraphs(self, content: str) -> List[str]:
+        """
+        按照 chunk_size 将文本分段，保持语句完整，尽量在句子结束处分段。
+        分段规则：按句号、换行符分割，且每段字数不超过 chunk_size。若没有合适的标点符号，则强制截断。
+        """
+        paragraphs = []
+        current_paragraph = ""
+
+        # 用正则按句子结束符（句号、问号、感叹号等）分割文本
+        sentences = re.split(r'([。！？])', content)  # 捕获句子结尾符
+
+        # 将分割后的句子和句末标点符号重新拼接成完整的句子
+        sentences = [s.strip() + (sentences[i + 1] if i + 1 < len(sentences) else '')
+                     for i, s in enumerate(sentences) if i % 2 == 0]
+
+        # 遍历所有句子
+        for sentence in sentences:
+            sentence = self.clean_text(sentence)
+            
+            # 判断当前句子是否加入到当前段落
+            if len(current_paragraph) + len(sentence) + 1 <= self.chunk_size:
+                # 如果加入后不超过最大字数，继续添加到当前段落
+                current_paragraph += (" " + sentence) if current_paragraph else sentence
+            else:
+                # 如果当前段落已超过字数限制，则检查是否能在当前句子末尾进行分割
+                if len(current_paragraph) > self.chunk_size:
+                    paragraphs.append(current_paragraph[:self.chunk_size])  # 强制截断
+                    current_paragraph = current_paragraph[self.chunk_size:]  # 剩余部分移到下一个段落
+                
+                # 处理当前句子
+                if len(sentence) > self.chunk_size:
+                    # 如果单个句子超过 chunk_size，则强制截断
+                    while len(sentence) > self.chunk_size:
+                        paragraphs.append(sentence[:self.chunk_size])
+                        sentence = sentence[self.chunk_size:]
+                
+                # 最后将剩余的句子添加到当前段落
+                current_paragraph = sentence
+
+        # 添加最后一个段落（如果有的话）
+        if current_paragraph:
+            paragraphs.append(current_paragraph)
+
+        return paragraphs
+    def split_documents(self, sections: List[str], doc_source) -> List[Document]:
+        """
+        递归分段，根据每个 Markdown 文档部分生成对应的内容
+        """
+        final_splits = []
+
+        for header, header_level, content in sections:
+            # 递归分段每个部分
+            paragraphs = self.split_paragraphs(content)
+
+            # 根据标题级别，将标题存入对应的 h1, h2, h3 等字段
+            metadata = {"source": doc_source, "header": header}
+            if header_level == 1:
+                metadata["h1"] = header
+            elif header_level == 2:
+                metadata["h2"] = header
+            elif header_level == 3:
+                metadata["h3"] = header
+            elif header_level == 4:
+                metadata["h4"] = header
+            elif header_level == 5:
+                metadata["h5"] = header
+            elif header_level == 6:
+                metadata["h6"] = header
+
+            # 对每个段落创建 Document 对象
+            for paragraph in paragraphs:
+                doc = Document(
+                    page_content=paragraph,
+                    metadata=metadata
+                )
+                final_splits.append(doc)
+
+        return final_splits
+
+    def split_markdown_text(self, markdown_document: str, doc_source: str) -> List[Document]:
+
+        # 首先根据标题分段
+        sections = self.split_text_by_headers(markdown_document)
+        
+        # 进一步分段并创建 Document 对象
+        final_splits = self.split_documents(sections, doc_source)
+        
+        # 返回最终的文档段落列表
+        return final_splits
+
+
+# 示例使用
+if __name__ == "__main__":
+    doc_source = ""
+    markdown_text = """
+    # 标题 1
+    QQQ
+    
+    ## 标题 2
+    WWW
+    
+    ### 标题 3
+    EEE
+    """
+    splitter = MarkdownTextSplitter()
+    splits = splitter.split_markdown_text(markdown_text, doc_source)
+    
+    for split in splits:
+        print(f"Header: {split.metadata}, Content: {split.page_content}")
--- a/langchain-chat/text_splitter/init.py
+++ b/langchain-chat/text_splitter/init.py
@@ -0,0 +1,7 @@
+from .chinese_text_splitter import ChineseTextSplitter
+from .ali_text_splitter import AliTextSplitter
+from .zh_title_enhance import zh_title_enhance
+from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
+from .chinese_recursive_paragraph_splitter import ChineseRecursiveParagraphSplitter
+from .GCYMarkdownTextSplitter import GCYMarkdownTextSplitter
+from .MarkdownTextSplitter import MarkdownTextSplitter
--- a/langchain-chat/text_splitter/ali_text_splitter.py
+++ b/langchain-chat/text_splitter/ali_text_splitter.py
@@ -0,0 +1,34 @@
+from langchain.text_splitter import CharacterTextSplitter
+import re
+from typing import List
+
+
+class AliTextSplitter(CharacterTextSplitter):
+    def __init__(self, pdf: bool = False, **kwargs):
+        super().__init__(**kwargs)
+        self.pdf = pdf
+
+    def split_text(self, text: str) -> List[str]:
+        # use_document_segmentation参数指定是否用语义切分文档，此处采取的文档语义分割模型为达摩院开源的nlp_bert_document-segmentation_chinese-base，论文见https://arxiv.org/abs/2107.09278
+        # 如果使用模型进行文档语义切分，那么需要安装modelscope[nlp]：pip install "modelscope[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+        # 考虑到使用了三个模型，可能对于低配置gpu不太友好，因此这里将模型load进cpu计算，有需要的话可以替换device为自己的显卡id
+        if self.pdf:
+            text = re.sub(r"\n{3,}", r"\n", text)
+            text = re.sub('\s', " ", text)
+            text = re.sub("\n\n", "", text)
+        try:
+            from modelscope.pipelines import pipeline
+        except ImportError:
+            raise ImportError(
+                "Could not import modelscope python package. "
+                "Please install modelscope with `pip install modelscope`. "
+            )
+
+
+        p = pipeline(
+            task="document-segmentation",
+            model='damo/nlp_bert_document-segmentation_chinese-base',
+            device="cpu")
+        result = p(documents=text)
+        sent_list = [i for i in result["text"].split("\n\t") if i]
+        return sent_list
--- a/langchain-chat/text_splitter/chinese_recursive_paragraph_splitter.py
+++ b/langchain-chat/text_splitter/chinese_recursive_paragraph_splitter.py
@@ -0,0 +1,88 @@
+import re
+from typing import List, Optional, Any
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def _split_text_with_regex_from_end(
+        text: str, separator: str, keep_separator: bool
+) -> List[str]:
+    # Now that we have the separator, split the text
+    if separator:
+        if keep_separator:
+            # The parentheses in the pattern keep the delimiters in the result.
+            _splits = re.split(f"({separator})", text)
+            splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
+            if len(_splits) % 2 == 1:
+                splits += _splits[-1:]
+            # splits = [_splits[0]] + splits
+        else:
+            splits = re.split(separator, text)
+    else:
+        splits = list(text)
+    return [s for s in splits if s != ""]
+
+
+class ChineseRecursiveParagraphSplitter(RecursiveCharacterTextSplitter):
+    def __init__(
+            self,
+            separators: Optional[List[str]] = None,
+            keep_separator: bool = True,
+            is_separator_regex: bool = True,
+            **kwargs: Any,
+    ) -> None:
+        """Create a new TextSplitter."""
+        super().__init__(keep_separator=keep_separator, **kwargs)
+        self._separators = separators or [
+            "\n\n",
+            "\n",
+            '\r\n',
+            '\r'
+        ]
+        self._is_separator_regex = is_separator_regex
+
+    def _split_text(self, text: str, separators: List[str]) -> List[str]:
+        """Split incoming text and return chunks."""
+        final_chunks = []
+        # Get appropriate separator to use
+        separator = separators[-1]
+        new_separators = []
+        for i, _s in enumerate(separators):
+            _separator = _s if self._is_separator_regex else re.escape(_s)
+            if _s == "":
+                separator = _s
+                break
+            if re.search(_separator, text):
+                separator = _s
+                new_separators = separators[i + 1:]
+                break
+
+        _separator = separator if self._is_separator_regex else re.escape(separator)
+        splits = _split_text_with_regex_from_end(text, _separator, self._keep_separator)
+        final_chunks = splits
+
+        return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
+
+
+# Manual smoke test: load a CSV through RapidOCRCSVLoader, split every loaded
+# document and print the length of each resulting chunk.
+if __name__ == "__main__":
+    text_splitter = ChineseRecursiveParagraphSplitter(
+        keep_separator=True,
+        is_separator_regex=True,
+        chunk_size=1,
+        chunk_overlap=0
+    )
+    import sys
+    # The search path must be extended before document_loaders is imported.
+    sys.path.append('../../../GCY-RAG-LangChain-ChatChat/')
+    # Hard-coded, machine-specific input file.
+    filepath = "/home/work/project/test_result.csv"
+    import document_loaders
+    
+    loader = document_loaders.RapidOCRCSVLoader(filepath, autodetect_encoding=True)
+    docs = loader.load()
+    for inum, text in enumerate(docs):
+        print(inum)
+        chunks = text_splitter.split_text(text.page_content)
+        for idx, chunk in enumerate(chunks):
+            print(f'///////////////////////// idx:{idx} //////////////////////////')
+            print(len(chunk))
--- a/langchain-chat/text_splitter/chinese_recursive_text_splitter.py
+++ b/langchain-chat/text_splitter/chinese_recursive_text_splitter.py
@@ -0,0 +1,127 @@
+import re
+from typing import List, Optional, Any
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def _split_text_with_regex_from_end(
+        text: str, separator: str, keep_separator: bool
+) -> List[str]:
+    # Now that we have the separator, split the text
+    if separator:
+        if keep_separator:
+            # The parentheses in the pattern keep the delimiters in the result.
+            _splits = re.split(f"({separator})", text)
+            splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
+            if len(_splits) % 2 == 1:
+                splits += _splits[-1:]
+            # splits = [_splits[0]] + splits
+        else:
+            splits = re.split(separator, text)
+    else:
+        splits = list(text)
+    return [s for s in splits if s != ""]
+
+
+class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
+    def __init__(
+            self,
+            separators: Optional[List[str]] = None,
+            keep_separator: bool = True,
+            is_separator_regex: bool = True,
+            **kwargs: Any,
+    ) -> None:
+        """Create a new TextSplitter."""
+        super().__init__(keep_separator=keep_separator, **kwargs)
+        self._separators = separators or [
+            "\n\n",
+            "\n",
+            "。|！|？",
+            "\.\s|\!\s|\?\s",
+            "；|;\s",
+            "，|,\s"
+        ]
+        self._is_separator_regex = is_separator_regex
+
+    def _split_text(self, text: str, separators: List[str]) -> List[str]:
+        """Split incoming text and return chunks."""
+        final_chunks = []
+        # Get appropriate separator to use
+        separator = separators[-1]
+        new_separators = []
+        for i, _s in enumerate(separators):
+            _separator = _s if self._is_separator_regex else re.escape(_s)
+            if _s == "":
+                separator = _s
+                break
+            if re.search(_separator, text):
+                separator = _s
+                new_separators = separators[i + 1:]
+                break
+
+        _separator = separator if self._is_separator_regex else re.escape(separator)
+        splits = _split_text_with_regex_from_end(text, _separator, self._keep_separator)
+
+        # Now go merging things, recursively splitting longer texts.
+        _good_splits = []
+        _separator = "" if self._keep_separator else separator
+        for s in splits:
+            if self._length_function(s) < self._chunk_size:
+                _good_splits.append(s)
+            else:
+                if _good_splits:
+                    merged_text = self._merge_splits(_good_splits, _separator)
+                    final_chunks.extend(merged_text)
+                    _good_splits = []
+                if not new_separators:
+                    final_chunks.append(s)
+                else:
+                    other_info = self._split_text(s, new_separators)
+                    final_chunks.extend(other_info)
+        if _good_splits:
+            merged_text = self._merge_splits(_good_splits, _separator)
+            final_chunks.extend(merged_text)
+        return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
+
+
+# Manual smoke test: split two sample passages with a 100-character chunk
+# size and print every resulting chunk.
+if __name__ == "__main__":
+    text_splitter = ChineseRecursiveTextSplitter(
+        keep_separator=True,
+        is_separator_regex=True,
+        chunk_size=100,
+        chunk_overlap=0
+    )
+    # Sample inputs (Chinese trade-report excerpts).
+    ls = [
+        """中国对外贸易形势报告（75页）。
+        前 10 个月，一般贸易进出口 19.5 万亿元，增长 25.1%， 比整体进出口增速高出 2.9 个百分点，
+        占进出口总额的 61.7%，较去年同期提升 1.6 个百分点。其中，一般贸易出口 10.6 万亿元，增长 25.3%，
+        占出口总额的 60.9%，提升 1.5 个百分点；进口8.9万亿元，增长24.9%，占进口总额的62.7%， 提升 1.8 个百分点。
+        加工贸易进出口 6.8 万亿元，增长 11.8%， 占进出口总额的 21.5%，减少 2.0 个百分点。其中，出口增 长 10.4%，
+        占出口总额的 24.3%，减少 2.6 个百分点；进口增 长 14.2%，占进口总额的 18.0%，减少 1.2 个百分点。
+        此外， 以保税物流方式进出口 3.96 万亿元，增长 27.9%。
+        其中，出 口 1.47 万亿元，增长 38.9%；进口 2.49 万亿元，增长 22.2%。
+        前三季度，中国服务贸易继续保持快速增长态势。服务 进出口总额 37834.3 亿元，增长 11.6%；
+        其中服务出口 17820.9 亿元，增长 27.3%；进口 20013.4 亿元，增长 0.5%，进口增 速实现了疫情以来的首次转正。
+        服务出口增幅大于进口 26.8 个百分点，带动服务贸易逆差下降 62.9%至 2192.5 亿元。
+        """,
+        """
+        服 务贸易结构持续优化，知识密集型服务进出口 16917.7 亿元， 增长 13.3%，占服务进出口总额的比重达到 44.7%，
+        提升 0.7 个百分点。 二、中国对外贸易发展环境分析和展望 全球疫情起伏反复，经济复苏分化加剧，
+        大宗商品价格 上涨、能源紧缺、运力紧张及发达经济体政策调整外溢等风 险交织叠加。同时也要看到，
+        我国经济长期向好的趋势没有 改变，外贸企业韧性和活力不断增强，新业态新模式加快发 展，创新转型步伐提速。
+        产业链供应链面临挑战。美欧等加快出台制造业回迁计 划，加速产业链供应链本土布局，跨国公司调整产业链供应 链，
+        全球双链面临新一轮重构，区域化、近岸化、本土化、 短链化趋势凸显。疫苗供应不足，制造业“缺芯”、
+        物流受限、 运价高企，全球产业链供应链面临压力。 全球通胀持续高位运行。能源价格上涨加大主要经济体 的通胀压力，
+        增加全球经济复苏的不确定性。世界银行今年 10 月发布《大宗商品市场展望》指出，能源价格在 2021 年 大涨逾 80%，
+        并且仍将在 2022 年小幅上涨。IMF 指出，全 球通胀上行风险加剧，通胀前景存在巨大不确定性。
+        """
+        ]
+    # text = """"""
+    # Print the index of each passage, then each chunk produced from it.
+    for inum, text in enumerate(ls):
+        print(inum)
+        chunks = text_splitter.split_text(text)
+        for idx, chunk in enumerate(chunks):
+            print(f'!!!!!!!!!!!!!!!!!!!!!idx:{idx}')
+            print(chunk)
--- a/langchain-chat/text_splitter/chinese_text_splitter.py
+++ b/langchain-chat/text_splitter/chinese_text_splitter.py
@@ -0,0 +1,59 @@
+from langchain.text_splitter import CharacterTextSplitter
+import re
+from typing import List
+
+
+class ChineseTextSplitter(CharacterTextSplitter):
+    def __init__(self, pdf: bool = False, sentence_size: int = 250, **kwargs):
+        super().__init__(**kwargs)
+        self.pdf = pdf
+        self.sentence_size = sentence_size
+
+    def split_text1(self, text: str) -> List[str]:
+        if self.pdf:
+            text = re.sub(r"\n{3,}", "\n", text)
+            text = re.sub('\s', ' ', text)
+            text = text.replace("\n\n", "")
+        sent_sep_pattern = re.compile('([﹒﹔﹖﹗．。！？]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del ：；
+        sent_list = []
+        for ele in sent_sep_pattern.split(text):
+            if sent_sep_pattern.match(ele) and sent_list:
+                sent_list[-1] += ele
+            elif ele:
+                sent_list.append(ele)
+        return sent_list
+
+    def split_text(self, text: str) -> List[str]:   ##此处需要进一步优化逻辑
+        if self.pdf:
+            text = re.sub(r"\n{3,}", r"\n", text)
+            text = re.sub('\s', " ", text)
+            text = re.sub("\n\n", "", text)
+
+        text = re.sub(r'([;；.!?。！？\?])([^”’])', r"\1\n\2", text)  # 单字符断句符
+        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # 英文省略号
+        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # 中文省略号
+        text = re.sub(r'([;；!?。！？\?]["’”」』]{0,2})([^;；!?，。！？\?])', r'\1\n\2', text)
+        # 如果双引号前有终止符，那么双引号才是句子的终点，把分句符\n放到双引号后，注意前面的几句都小心保留了双引号
+        text = text.rstrip()  # 段尾如果有多余的\n就去掉它
+        # 很多规则中会考虑分号;，但是这里我把它忽略不计，破折号、英文双引号等同样忽略，需要的再做些简单调整即可。
+        ls = [i for i in text.split("\n") if i]
+        for ele in ls:
+            if len(ele) > self.sentence_size:
+                ele1 = re.sub(r'([,，.]["’”」』]{0,2})([^,，.])', r'\1\n\2', ele)
+                ele1_ls = ele1.split("\n")
+                for ele_ele1 in ele1_ls:
+                    if len(ele_ele1) > self.sentence_size:
+                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
+                        ele2_ls = ele_ele2.split("\n")
+                        for ele_ele2 in ele2_ls:
+                            if len(ele_ele2) > self.sentence_size:
+                                ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
+                                ele2_id = ele2_ls.index(ele_ele2)
+                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
+                                                                                                       ele2_id + 1:]
+                        ele_id = ele1_ls.index(ele_ele1)
+                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
+
+                id = ls.index(ele)
+                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
+        return ls
--- a/langchain-chat/text_splitter/zh_title_enhance.py
+++ b/langchain-chat/text_splitter/zh_title_enhance.py
@@ -0,0 +1,99 @@
+from langchain.docstore.document import Document
+import re
+
+
+def under_non_alpha_ratio(text: str, threshold: float = 0.5):
+    """Checks if the proportion of non-alpha characters in the text snippet exceeds a given
+    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
+    as a title or narrative text. The ratio does not count spaces.
+
+    Parameters
+    ----------
+    text
+        The input string to test
+    threshold
+        If the proportion of non-alpha characters exceeds this threshold, the function
+        returns False
+    """
+    if len(text) == 0:
+        return False
+
+    alpha_count = len([char for char in text if char.strip() and char.isalpha()])
+    total_count = len([char for char in text if char.strip()])
+    try:
+        ratio = alpha_count / total_count
+        return ratio < threshold
+    except:
+        return False
+
+
+def is_possible_title(
+        text: str,
+        title_max_word_length: int = 20,
+        non_alpha_threshold: float = 0.5,
+) -> bool:
+    """Checks to see if the text passes all of the checks for a valid title.
+
+    Parameters
+    ----------
+    text
+        The input text to check
+    title_max_word_length
+        The maximum number of words a title can contain
+    non_alpha_threshold
+        The minimum number of alpha characters the text needs to be considered a title
+    """
+
+    # 文本长度为0的话，肯定不是title
+    if len(text) == 0:
+        print("Not a title. Text is empty.")
+        return False
+
+    # 文本中有标点符号，就不是title
+    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
+    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
+    if ENDS_IN_PUNCT_RE.search(text) is not None:
+        return False
+
+    # 文本长度不能超过设定值，默认20
+    # NOTE(robinson) - splitting on spaces here instead of word tokenizing because it
+    # is less expensive and actual tokenization doesn't add much value for the length check
+    if len(text) > title_max_word_length:
+        return False
+
+    # 文本中数字的占比不能太高，否则不是title
+    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
+        return False
+
+    # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
+    if text.endswith((",", ".", "，", "。")):
+        return False
+
+    if text.isnumeric():
+        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
+        return False
+
+    # 开头的字符内应该有数字，默认5个字符内
+    if len(text) < 5:
+        text_5 = text
+    else:
+        text_5 = text[:5]
+    alpha_in_text_5 = sum(list(map(lambda x: x.isnumeric(), list(text_5))))
+    if not alpha_in_text_5:
+        return False
+
+    return True
+
+
+def zh_title_enhance(docs: Document) -> Document:
+    title = None
+    if len(docs) > 0:
+        for doc in docs:
+            if is_possible_title(doc.page_content):
+                doc.metadata['category'] = 'cn_Title'
+                title = doc.page_content
+            elif title:
+                doc.page_content = f"下文与({title})有关。{doc.page_content}"
+        return docs
+    else:
+        print("文件不存在")