76 lines
3.1 KiB
Python
76 lines
3.1 KiB
Python
|
|
import multiprocessing
|
|||
|
|
import re
|
|||
|
|
import time
|
|||
|
|
import networkx as nx
|
|||
|
|
import numpy as np
|
|||
|
|
from textrank4zh import TextRank4Keyword, TextRank4Sentence
|
|||
|
|
from joblib import Parallel, delayed, parallel_backend
|
|||
|
|
import logging
|
|||
|
|
# Expose ``from_numpy_array`` under the name ``from_numpy_matrix`` on the
# networkx module object.
# NOTE(review): presumably a compatibility shim for a dependency (possibly
# textrank4zh) that still calls the older name -- confirm before removing.
nx.from_numpy_matrix = nx.from_numpy_array
|
|||
|
|
|
|||
|
|
# Configure the root logger: INFO level, "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|||
|
|
|
|||
|
|
|
|||
|
|
def process_text_segment(text_segment, num_sentences):
    """Extract weighted keywords and key sentences from one text segment.

    The segment is analysed first with ``TextRank4Keyword`` and then with
    ``TextRank4Sentence``.

    Args:
        text_segment: The text to analyse.
        num_sentences: Forwarded as ``num`` to ``get_key_sentences``.

    Returns:
        A ``(keywords, summaries)`` tuple. ``keywords`` is a list of
        ``(word, weight)`` pairs taken from ``get_keywords(30, word_min_len=4)``;
        ``summaries`` is a list of the ``sentence`` attribute of each key
        sentence returned.
    """
    # Keyword pass.
    keyword_ranker = TextRank4Keyword()
    keyword_ranker.analyze(text=text_segment, lower=True, window=5)
    keywords = []
    for entry in keyword_ranker.get_keywords(30, word_min_len=4):
        keywords.append((entry.word, entry.weight))

    # Sentence pass.
    sentence_ranker = TextRank4Sentence()
    sentence_ranker.analyze(text=text_segment, lower=True, source='all_filters')
    key_sentences = sentence_ranker.get_key_sentences(num=num_sentences)
    summaries = [entry.sentence for entry in key_sentences]

    return keywords, summaries
|
|||
|
|
|
|||
|
|
|
|||
|
|
def split_text_by_sentences(text, n_parts):
    """Split ``text`` into ``n_parts`` chunks along regex-detected sentences.

    Sentences are found by splitting on whitespace that follows ``.``, ``?``
    or ``!``, except where the lookbehinds in the pattern reject the match.
    The sentences are then dealt out in order so that the first
    ``len(sentences) % n_parts`` chunks receive one extra sentence.

    Args:
        text: The text to split.
        n_parts: Number of chunks to produce.

    Returns:
        A list of ``n_parts`` strings, each the space-joined sentences of one
        chunk. Chunks are empty strings when there are fewer sentences than
        parts.
    """
    pieces = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    base, extra = divmod(len(pieces), n_parts)

    chunks = []
    start = 0
    for idx in range(n_parts):
        # The first `extra` chunks each take one sentence more than `base`.
        stop = start + base + (1 if idx < extra else 0)
        chunks.append(' '.join(pieces[start:stop]))
        start = stop
    return chunks
|
|||
|
|
from nltk.tokenize import sent_tokenize
|
|||
|
|
def split_text_balanced(text, n_parts):
    """Split ``text`` into sentence-aligned parts of near-equal sentence count.

    Sentences come from ``sent_tokenize``. The requested ``n_parts`` is
    capped so that each part would hold at least 10 sentences, with a floor
    of one part.

    Args:
        text: The text to split.
        n_parts: Requested number of parts (an upper bound).

    Returns:
        A list of strings, one per part, each the space-joined sentences of
        that part. Earlier parts receive the remainder sentences.
    """
    min_sentences_per_part = 10
    sentences = sent_tokenize(text)

    # Never exceed the request, never drop below a single part.
    part_count = max(1, min(n_parts, len(sentences) // min_sentences_per_part))
    base, extra = divmod(len(sentences), part_count)

    parts = []
    start = 0
    for idx in range(part_count):
        # The first `extra` parts each take one sentence more than `base`.
        stop = start + base + (1 if idx < extra else 0)
        parts.append(' '.join(sentences[start:stop]))
        start = stop
    return parts
|
|||
|
|
|
|||
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|||
|
|
def TextRank(text, num_sentences, n_cores=multiprocessing.cpu_count()):
    """Summarise ``text`` by running TextRank over balanced segments.

    The text is split with ``split_text_balanced`` and each segment is passed
    to ``process_text_segment`` one after another in the calling process.
    All collected keywords are printed in descending weight order, and the
    key sentences of every segment are concatenated without a separator.

    Args:
        text: The text to summarise.
        num_sentences: Number of key sentences requested per segment.
        n_cores: Requested number of segments. The default is the CPU count
            evaluated once, when this function is defined.

    Returns:
        The concatenated key sentences as a single string.
    """
    started_at = time.time()
    logging.info("TextRank 函数开始执行")

    segments = split_text_balanced(text, n_cores)

    collected_keywords = []
    summary_sentences = []
    for segment in segments:
        segment_keywords, segment_sentences = process_text_segment(segment, num_sentences)
        collected_keywords.extend(segment_keywords)
        summary_sentences.extend(segment_sentences)

    # Report keywords from heaviest to lightest.
    ranked_keywords = sorted(collected_keywords, key=lambda pair: pair[1], reverse=True)
    for word, weight in ranked_keywords:
        print(word, weight)

    summary = "".join(summary_sentences)

    elapsed_time = time.time() - started_at
    logging.info(f"TextRank 函数执行结束,耗时: {elapsed_time:.2f} 秒")

    return summary
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Demo run: supply the required arguments, summarise a short sample
    # text, and print the original and summarised lengths.
    text = """中华人民共和国国民经济和社会发展第十四个五年(2021-2025年)规划和2035年远景目标纲要"""
    num_sentences = 80

    summary = TextRank(text, num_sentences)

    print(f"原文长度{len(text)},压缩文本后长度 {len(summary)}")
|