[前端+RAG] 添加PDF阅读模式切换(预览/笔记);改进PDF→HTML用get_text(html)保留格式;修复长文件名操作菜单不显示

This commit is contained in:
2026-04-02 14:21:41 +08:00
parent 5158753b94
commit 379187f675
2 changed files with 52 additions and 14 deletions

View File

@@ -916,7 +916,7 @@ class FileConverter:
return "".join(parts) if parts else '<p><em>(本页无文本内容)</em></p>'
def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
"""PDF 预览:本进程内 PyMuPDF 抽文本生成 HTML不调用外部 /convert 微服务"""
"""PDF 预览:使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片"""
allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT)
abs_input = os.path.abspath(input_path)
if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep):
@@ -935,27 +935,32 @@ class FileConverter:
with fitz.open(abs_input) as doc:
for i in range(len(doc)):
page = doc.load_page(i)
raw = (page.get_text() or "").strip()
if raw:
# 使用 get_text("html") 保留格式和图片(图片以 base64 内嵌)
page_html = (page.get_text("html") or "").strip()
if page_html:
any_text = True
inner = self._pdf_plain_text_to_html(raw)
sections.append(
f'<section class="pdf-page" data-page="{i + 1}" '
'style="margin-bottom:1.5em;padding-bottom:1em;border-bottom:1px solid #e5e5e5;">'
f'<div style="font-size:12px;color:#888;margin-bottom:8px;">第 {i + 1} 页</div>'
f"{inner}</section>"
f"{page_html}</section>"
)
css = '''<style>
.pdf-preview { font-family: system-ui, -apple-system, sans-serif; line-height: 1.6; max-width: 100%; }
.pdf-preview img { max-width: 100%; height: auto; }
.pdf-preview p { margin: 0.3em 0; }
.pdf-preview span { line-height: 1.5; }
</style>'''
if not any_text:
wrapper = (
'<div class="pdf-preview" style="font-family:system-ui,-apple-system,Segoe UI,Roboto,sans-serif;'
'line-height:1.6;max-width:900px;">'
f'{css}<div class="pdf-preview">'
"<p><em>(未能从 PDF 提取到文本,可能是扫描件或加密文档。)</em></p></div>"
)
else:
wrapper = (
'<div class="pdf-preview" style="font-family:system-ui,-apple-system,Segoe UI,Roboto,sans-serif;'
'line-height:1.6;max-width:900px;">'
f'{css}<div class="pdf-preview">'
f"{''.join(sections)}</div>"
)
return self._save_html(f"<body>{wrapper}</body>", output_path)