[前端+RAG] 添加PDF阅读模式切换(预览/笔记);改进PDF→HTML用get_text(html)保留格式;修复长文件名操作菜单不显示
This commit is contained in:
@@ -916,7 +916,7 @@ class FileConverter:
|
||||
return "".join(parts) if parts else '<p><em>(本页无文本内容)</em></p>'
|
||||
|
||||
def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
|
||||
"""PDF 预览:本进程内 PyMuPDF 抽文本生成 HTML,不调用外部 /convert 微服务。"""
|
||||
"""PDF 预览:使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片。"""
|
||||
allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT)
|
||||
abs_input = os.path.abspath(input_path)
|
||||
if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep):
|
||||
@@ -935,27 +935,32 @@ class FileConverter:
|
||||
with fitz.open(abs_input) as doc:
|
||||
for i in range(len(doc)):
|
||||
page = doc.load_page(i)
|
||||
raw = (page.get_text() or "").strip()
|
||||
if raw:
|
||||
# 使用 get_text("html") 保留格式和图片(base64内嵌)
|
||||
page_html = (page.get_text("html") or "").strip()
|
||||
if page_html:
|
||||
any_text = True
|
||||
inner = self._pdf_plain_text_to_html(raw)
|
||||
sections.append(
|
||||
f'<section class="pdf-page" data-page="{i + 1}" '
|
||||
'style="margin-bottom:1.5em;padding-bottom:1em;border-bottom:1px solid #e5e5e5;">'
|
||||
f'<div style="font-size:12px;color:#888;margin-bottom:8px;">第 {i + 1} 页</div>'
|
||||
f"{inner}</section>"
|
||||
f"{page_html}</section>"
|
||||
)
|
||||
|
||||
css = '''<style>
|
||||
.pdf-preview { font-family: system-ui, -apple-system, sans-serif; line-height: 1.6; max-width: 100%; }
|
||||
.pdf-preview img { max-width: 100%; height: auto; }
|
||||
.pdf-preview p { margin: 0.3em 0; }
|
||||
.pdf-preview span { line-height: 1.5; }
|
||||
</style>'''
|
||||
|
||||
if not any_text:
|
||||
wrapper = (
|
||||
'<div class="pdf-preview" style="font-family:system-ui,-apple-system,Segoe UI,Roboto,sans-serif;'
|
||||
'line-height:1.6;max-width:900px;">'
|
||||
f'{css}<div class="pdf-preview">'
|
||||
"<p><em>(未能从 PDF 提取到文本,可能是扫描件或加密文档。)</em></p></div>"
|
||||
)
|
||||
else:
|
||||
wrapper = (
|
||||
'<div class="pdf-preview" style="font-family:system-ui,-apple-system,Segoe UI,Roboto,sans-serif;'
|
||||
'line-height:1.6;max-width:900px;">'
|
||||
f'{css}<div class="pdf-preview">'
|
||||
f"{''.join(sections)}</div>"
|
||||
)
|
||||
return self._save_html(f"<body>{wrapper}</body>", output_path)
|
||||
|
||||
Reference in New Issue
Block a user