[前端+RAG] 异步上传+前端轮询自动刷新导读;PDF阅读模式合并行消除留白

This commit is contained in:
2026-04-02 17:17:36 +08:00
parent ee7c4a73ed
commit 5dcb8771ed
4 changed files with 182 additions and 56 deletions

View File

@@ -957,22 +957,33 @@ class FileConverter:
if text.strip():
any_text = True
# 按行处理文本,识别标题
lines = text.split('\n')
# 合并连续非空行为段落,空行分段,标题行独立
current_para = []
for line in lines:
line = line.strip()
if not line:
stripped = line.strip()
if not stripped:
# 空行 → 结束当前段落
if current_para:
page_parts.append(f'<p>{self._escape_html("".join(current_para))}</p>')
current_para = []
continue
# 简单的标题检测:短行 + 无标点结尾
is_heading = (len(line) < 40 and not line.endswith(('', '', '', '', '', ',', '.', ';'))
and not line.startswith(('', '('))
and re.match(r'^[一二三四五六七八九十\d]+[、.]', line))
# 标题检测
is_heading = (len(stripped) < 30
and not stripped.endswith(('', '', '', '', '', ',', '.', ';'))
and not stripped.startswith(('', '('))
and re.match(r'^[一二三四五六七八九十\d]+[、.]', stripped))
if is_heading:
escaped = self._escape_html(line)
page_parts.append(f'<h3>{escaped}</h3>')
# 先输出累积的段落
if current_para:
page_parts.append(f'<p>{self._escape_html("".join(current_para))}</p>')
current_para = []
page_parts.append(f'<h3>{self._escape_html(stripped)}</h3>')
else:
escaped = self._escape_html(line)
page_parts.append(f'<p>{escaped}</p>')
current_para.append(stripped)
# 输出最后一个段落
if current_para:
page_parts.append(f'<p>{self._escape_html("".join(current_para))}</p>')
# 渲染表格
for table in tables: