[全量] 初始化项目代码、配置、文档及Agent协同harness
This commit is contained in:
76
langchain-chat/server/knowledge_base/TexkRank.py
Normal file
76
langchain-chat/server/knowledge_base/TexkRank.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import multiprocessing
|
||||
import re
|
||||
import time
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
from textrank4zh import TextRank4Keyword, TextRank4Sentence
|
||||
from joblib import Parallel, delayed, parallel_backend
|
||||
import logging
|
||||
# Expose ``from_numpy_array`` under the older ``from_numpy_matrix`` name.
# NOTE(review): presumably required because textrank4zh still calls
# ``nx.from_numpy_matrix``, which newer networkx releases no longer ship -- confirm.
nx.from_numpy_matrix = nx.from_numpy_array
|
||||
|
||||
# Configure the root logger: INFO level, "timestamp - level - message" layout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
|
||||
def process_text_segment(text_segment, num_sentences):
    """Rank keywords and key sentences for a single piece of text.

    Args:
        text_segment: The text to analyse.
        num_sentences: How many key sentences to request from the
            sentence ranker.

    Returns:
        A ``(keywords, summaries)`` tuple where ``keywords`` is a list of
        ``(word, weight)`` pairs and ``summaries`` is a list of the
        selected sentences.
    """
    # Keyword pass: request 30 keywords with word_min_len=4.
    keyword_ranker = TextRank4Keyword()
    keyword_ranker.analyze(text=text_segment, lower=True, window=5)
    keywords = []
    for entry in keyword_ranker.get_keywords(30, word_min_len=4):
        keywords.append((entry.word, entry.weight))

    # Sentence pass: keep only the sentence text of each ranked item.
    sentence_ranker = TextRank4Sentence()
    sentence_ranker.analyze(text=text_segment, lower=True, source='all_filters')
    summaries = []
    for entry in sentence_ranker.get_key_sentences(num=num_sentences):
        summaries.append(entry.sentence)

    return keywords, summaries
|
||||
|
||||
|
||||
def split_text_by_sentences(text, n_parts):
    """Split ``text`` into ``n_parts`` chunks made of whole sentences.

    A sentence boundary is a whitespace character that follows ``.``, ``?``
    or ``!``, unless the preceding characters look like an abbreviation
    such as ``e.g.`` or ``Mr.``. Sentences are distributed as evenly as
    possible; when they do not divide evenly, the earlier chunks receive
    one extra sentence each.

    Args:
        text: The text to split.
        n_parts: Desired number of chunks. Values below 1 are treated as 1
            so the text is never dropped and no division by zero occurs.

    Returns:
        A list of ``max(1, n_parts)`` strings, each the space-joined
        sentences of one chunk. If there are fewer sentences than chunks,
        the trailing chunks are empty strings.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)

    # Clamp like split_text_balanced does: 0 would raise ZeroDivisionError
    # and a negative count would silently yield an empty result.
    n_parts = max(1, n_parts)

    k, m = divmod(len(sentences), n_parts)
    # Chunk i starts after i*k base sentences plus one extra for each of
    # the first min(i, m) chunks that absorbed a remainder sentence.
    return [
        ' '.join(sentences[i * k + min(i, m):(i + 1) * k + min(i + 1, m)])
        for i in range(n_parts)
    ]
|
||||
from nltk.tokenize import sent_tokenize
|
||||
def split_text_balanced(text, n_parts):
    """Split ``text`` into at most ``n_parts`` sentence-aligned chunks.

    Sentences come from ``sent_tokenize``. The chunk count is capped so
    that each chunk can hold at least ten sentences, and it never drops
    below one. Sentences are spread as evenly as possible, with earlier
    chunks taking one extra sentence when the split is uneven.

    Args:
        text: The text to split.
        n_parts: Upper bound on the number of chunks.

    Returns:
        A list of strings, each the space-joined sentences of one chunk.
    """
    min_sentences_per_part = 10
    sentences = sent_tokenize(text)
    total = len(sentences)

    # Use no more chunks than requested and no more than fit the minimum
    # chunk size, but always at least one.
    part_count = min(n_parts, total // min_sentences_per_part)
    if part_count < 1:
        part_count = 1

    base, extra = divmod(total, part_count)
    chunks = []
    begin = 0
    for index in range(part_count):
        # The first `extra` chunks absorb the remainder, one sentence each.
        size = base + 1 if index < extra else base
        chunks.append(' '.join(sentences[begin:begin + size]))
        begin += size
    return chunks
|
||||
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
def TextRank(text, num_sentences, n_cores=multiprocessing.cpu_count()):
    """Condense ``text`` into its key sentences, segment by segment.

    The text is partitioned with ``split_text_balanced`` and each segment
    is passed to ``process_text_segment``. Segments are handled one after
    another in the calling process; ``n_cores`` only sets the upper bound
    on how many segments are produced. All collected keywords are printed
    to stdout in descending weight order, and the start and elapsed time
    are logged.

    Args:
        text: The text to summarise.
        num_sentences: Number of key sentences requested per segment.
        n_cores: Maximum number of segments; defaults to the CPU count
            captured when the function is defined.

    Returns:
        The key sentences of all segments concatenated without a separator.
    """
    start_time = time.time()
    logging.info("TextRank 函数开始执行")

    all_keywords, all_summaries = [], []
    for part in split_text_balanced(text, n_cores):
        keywords, summaries = process_text_segment(part, num_sentences)
        all_keywords += keywords
        all_summaries += summaries

    # Report every keyword, heaviest first.
    ranked_keywords = sorted(all_keywords, key=lambda pair: pair[1], reverse=True)
    for word, weight in ranked_keywords:
        print(word, weight)

    summary_text = "".join(all_summaries)

    elapsed_time = time.time() - start_time
    logging.info(f"TextRank 函数执行结束,耗时: {elapsed_time:.2f} 秒")

    return summary_text
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo run: summarise a sample text and report the size before and after.
    text = """中华人民共和国国民经济和社会发展第十四个五年(2021-2025年)规划和2035年远景目标纲要"""
    num_sentences = 80
    summary = TextRank(text=text, num_sentences=num_sentences)
    print(f"原文长度{len(text)},压缩文本后长度 {len(summary)}")
|
||||
Reference in New Issue
Block a user