Files
gangyan/langchain-chat/server/knowledge_base/TexkRank.py

76 lines
3.1 KiB
Python
Raw Normal View History

import multiprocessing
import re
import time
import networkx as nx
import numpy as np
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from joblib import Parallel, delayed, parallel_backend
import logging
# Compatibility alias: expose networkx's ``from_numpy_array`` under the older
# ``from_numpy_matrix`` attribute name.
# NOTE(review): presumably required because textrank4zh still calls
# ``nx.from_numpy_matrix``, which newer networkx releases no longer ship — confirm.
nx.from_numpy_matrix = nx.from_numpy_array
# Configure the root logger: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def process_text_segment(text_segment, num_sentences):
    """Extract weighted keywords and key sentences from one text segment.

    Args:
        text_segment: The text to analyse.
        num_sentences: Number of key sentences requested from the sentence
            ranker.

    Returns:
        A ``(keywords, summaries)`` tuple, where ``keywords`` is a list of
        ``(word, weight)`` pairs and ``summaries`` is a list of sentences.
    """
    # Keyword pass: request 30 keywords with ``word_min_len=4``.
    keyword_ranker = TextRank4Keyword()
    keyword_ranker.analyze(text=text_segment, lower=True, window=5)
    keywords = []
    for entry in keyword_ranker.get_keywords(30, word_min_len=4):
        keywords.append((entry.word, entry.weight))

    # Sentence pass: request ``num_sentences`` key sentences.
    sentence_ranker = TextRank4Sentence()
    sentence_ranker.analyze(text=text_segment, lower=True, source='all_filters')
    summaries = []
    for entry in sentence_ranker.get_key_sentences(num=num_sentences):
        summaries.append(entry.sentence)

    return keywords, summaries
def split_text_by_sentences(text, n_parts):
    """Split the text into n_parts chunks of whole sentences using a regular expression.

    Sentences are distributed as evenly as possible: the first
    ``len(sentences) % n_parts`` chunks receive one extra sentence.

    Args:
        text: The text to split.
        n_parts: Desired number of chunks. Values below 1 are treated as 1.

    Returns:
        A list of ``n_parts`` strings, each the space-joined sentences of one
        chunk. When ``n_parts`` exceeds the number of sentences, the trailing
        chunks are empty strings.
    """
    # Split on a whitespace character that follows '.', '?' or '!', unless the
    # text just before it looks like an abbreviation ("e.g."-style or
    # "Mr."-style).
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)

    # Clamp to at least one part: divmod() would raise ZeroDivisionError for 0
    # and a negative count would silently yield no chunks at all. This mirrors
    # the lower bound applied by split_text_balanced.
    n_parts = max(1, n_parts)

    base, extra = divmod(len(sentences), n_parts)
    parts = []
    start = 0
    for index in range(n_parts):
        # The first ``extra`` chunks absorb the remainder, one sentence each.
        stop = start + base + (1 if index < extra else 0)
        parts.append(' '.join(sentences[start:stop]))
        start = stop
    return parts
from nltk.tokenize import sent_tokenize
def split_text_balanced(text, n_parts):
    """Split text into at most n_parts chunks of whole sentences.

    The number of chunks is reduced so that each chunk can hold at least
    ``min_sentences_per_part`` sentences, but never drops below one chunk.
    Sentences are then spread as evenly as possible, with the earliest
    chunks receiving the remainder.

    Args:
        text: The text to split; sentences are found with ``sent_tokenize``.
        n_parts: Upper bound on the number of chunks.

    Returns:
        A list of strings, each the space-joined sentences of one chunk.
    """
    sentences = sent_tokenize(text)
    min_sentences_per_part = 10

    # Cap the chunk count by how many full-size chunks the text can fill,
    # while always keeping at least one chunk.
    affordable_parts = len(sentences) // min_sentences_per_part
    part_count = max(1, min(n_parts, affordable_parts))

    base, extra = divmod(len(sentences), part_count)
    parts = []
    start = 0
    for index in range(part_count):
        # The first ``extra`` chunks take one additional sentence each.
        stop = start + base + (1 if index < extra else 0)
        parts.append(' '.join(sentences[start:stop]))
        start = stop
    return parts
from concurrent.futures import ProcessPoolExecutor, as_completed
def TextRank(text, num_sentences, n_cores=multiprocessing.cpu_count()):
    """Summarise text by running keyword and sentence ranking per segment.

    The text is split with ``split_text_balanced`` and each segment is passed
    to ``process_text_segment``. Segments are processed one after another in
    this function; ``n_cores`` is only used as the requested segment count.
    All collected keywords are printed in descending weight order, and the
    key sentences of every segment are concatenated into the result.

    Args:
        text: The text to summarise.
        num_sentences: Number of key sentences requested for each segment.
        n_cores: Requested number of segments; defaults to the CPU count
            captured when this function is defined.

    Returns:
        The key sentences of all segments joined into a single string with
        no separator.
    """
    started_at = time.time()
    logging.info("TextRank 函数开始执行")

    collected_keywords = []
    collected_sentences = []
    for segment in split_text_balanced(text, n_cores):
        segment_keywords, segment_sentences = process_text_segment(segment, num_sentences)
        collected_keywords += segment_keywords
        collected_sentences += segment_sentences

    # Stable in-place sort, highest weight first, then echo each keyword.
    collected_keywords.sort(key=lambda pair: pair[1], reverse=True)
    for word, weight in collected_keywords:
        print(word, weight)

    summary_text = "".join(collected_sentences)

    elapsed_time = time.time() - started_at
    logging.info(f"TextRank 函数执行结束,耗时: {elapsed_time:.2f}")
    return summary_text
if __name__ == '__main__':
    # Demo run: summarise a sample title, requesting 80 key sentences per
    # segment, then report the original and summarised lengths.
    text = """中华人民共和国国民经济和社会发展第十四个五年20212025年规划和2035年远景目标纲要"""
    summary = TextRank(text, num_sentences=80)
    print(f"原文长度{len(text)},压缩文本后长度 {len(summary)}")