[全量] 初始化项目代码、配置、文档及Agent协同harness
This commit is contained in:
99
langchain-chat/text_splitter/zh_title_enhance.py
Normal file
99
langchain-chat/text_splitter/zh_title_enhance.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from langchain.docstore.document import Document
|
||||
import re
|
||||
|
||||
|
||||
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
    """Report whether alphabetic characters make up too small a share of the text.

    This helps prevent text like "-----------BREAK---------" from being tagged
    as a title or narrative text. Whitespace characters are ignored on both
    sides of the ratio.

    Parameters
    ----------
    text
        The input string to test
    threshold
        Minimum share (0.0 - 1.0) of non-whitespace characters that must be
        alphabetic. The comparison is strict: a ratio exactly equal to the
        threshold is not considered "under" it.

    Returns
    -------
    bool
        True if the alphabetic share is strictly below ``threshold``; False
        otherwise, including when the text is empty or contains only whitespace.
    """
    # char.strip() is empty (falsy) for whitespace, so only visible characters remain.
    visible_chars = [char for char in text if char.strip()]

    # Empty or whitespace-only text has no ratio to compute; guard explicitly
    # instead of relying on a catch-all handler around the division.
    if not visible_chars:
        return False

    alpha_count = sum(1 for char in visible_chars if char.isalpha())
    return alpha_count / len(visible_chars) < threshold
|
||||
|
||||
|
||||
def is_possible_title(
    text: str,
    title_max_word_length: int = 20,
    non_alpha_threshold: float = 0.5,
) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.

    A candidate is accepted only if it is non-empty, does not end with a
    punctuation character, is short enough, has a sufficient share of
    alphabetic characters, is not purely numeric, and has at least one
    numeric character among its first five characters.

    Parameters
    ----------
    text
        The input text to check
    title_max_word_length
        The maximum length of a title, measured in characters (``len(text)``),
        not in whitespace-separated words
    non_alpha_threshold
        The minimum share of non-whitespace characters that must be alphabetic;
        passed to ``under_non_alpha_ratio`` as its ``threshold``

    Returns
    -------
    bool
        True if the text looks like a title, False otherwise.
    """
    # Empty text can never be a title.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    # Reject text whose final character is neither a word character nor
    # whitespace, i.e. text that ends in punctuation. This also covers
    # salutations such as "To My Dearest Friends," and sentences ending in a
    # full stop, so no separate endswith() check on commas/periods is needed.
    if re.search(r"[^\w\s]\Z", text) is not None:
        return False

    # The length limit is applied to the character count (default 20).
    if len(text) > title_max_word_length:
        return False

    # Too small a share of alphabetic characters (mostly digits or symbols)
    # means the text is not a title.
    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
        return False

    if text.isnumeric():
        print(f"Not a title. Text is all numeric:\n\n{text}")
        return False

    # A title must contain a numeric character within its first five
    # characters; slicing already handles text shorter than five characters.
    return any(char.isnumeric() for char in text[:5])
|
||||
|
||||
|
||||
def zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Tag title-like documents and prefix the documents that follow them.

    Each document whose ``page_content`` passes ``is_possible_title`` gets
    ``metadata['category']`` set to ``'cn_Title'`` and becomes the current
    title. Every later non-title document has its ``page_content`` prefixed
    with a sentence naming the most recent title. Documents that appear before
    the first title are left unchanged. The documents are modified in place.

    Parameters
    ----------
    docs
        The documents to process, in reading order

    Returns
    -------
    list
        The same ``docs`` object that was passed in. For empty input a message
        is printed and the empty input is returned unchanged, so callers can
        always reassign the result.
    """
    if len(docs) == 0:
        print("文件不存在")
        # Hand back the empty input instead of an implicit None.
        return docs

    title = None
    for doc in docs:
        if is_possible_title(doc.page_content):
            doc.metadata['category'] = 'cn_Title'
            title = doc.page_content
        elif title:
            # Only prefix once a title has been seen earlier in the sequence.
            doc.page_content = f"下文与({title})有关。{doc.page_content}"
    return docs
|
||||
Reference in New Issue
Block a user