[RAG] PDF阅读模式改用pdfplumber(文本+表格提取,干净HTML);修复ck_mysql_config导入

This commit is contained in:
2026-04-02 16:18:59 +08:00
parent 05e33d1d05
commit 46428b7936
3 changed files with 72 additions and 26 deletions

View File

@@ -907,13 +907,6 @@ onMounted(async () => {
flex: 1; overflow: auto; position: relative; padding: 0;
.view-md {
padding: 20px;
// 覆盖 PyMuPDF get_text("html") 输出的固定宽度
:deep(div) { max-width: 100% !important; }
:deep(div[style*="width:"]) { width: auto !important; max-width: 100% !important; }
:deep(.pdf-page) { max-width: 100% !important; }
:deep(.pdf-page > div) { width: auto !important; max-width: 100% !important; }
:deep(.pdf-preview) { max-width: 100% !important; }
:deep(section) { max-width: 100% !important; }
:deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; }
:deep(.highlight) { background: #D0EAC8; }
:deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; }

View File

@@ -915,8 +915,13 @@ class FileConverter:
parts.append("<br/>")
return "".join(parts) if parts else '<p><em>(本页无文本内容)</em></p>'
@staticmethod
def _escape_html(text: str) -> str:
    """Escape the HTML-special characters ``&``, ``<``, ``>`` and ``"``.

    Each of the four characters is replaced by its named entity; every
    other character (including the single quote) is returned unchanged.

    Args:
        text: Raw text to embed in HTML markup.

    Returns:
        The text with the four special characters replaced by entities.
    """
    # A single translate() pass maps each source character exactly once,
    # so an ampersand that belongs to a produced entity is never escaped
    # a second time.
    entity_for = {
        ord('&'): '&amp;',
        ord('<'): '&lt;',
        ord('>'): '&gt;',
        ord('"'): '&quot;',
    }
    return text.translate(entity_for)
def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
"""PDF 预览:使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片"""
"""PDF 预览:使用 pdfplumber 提取文本和表格,生成干净的 HTML"""
allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT)
abs_input = os.path.abspath(input_path)
if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep):
@@ -930,35 +935,82 @@ class FileConverter:
return "不是 PDF 文件"
try:
import pdfplumber
import re
sections: list[str] = []
any_text = False
with fitz.open(abs_input) as doc:
for i in range(len(doc)):
page = doc.load_page(i)
# 使用 get_text("html") 保留格式和图片base64内嵌
page_html = (page.get_text("html") or "").strip()
if page_html:
with pdfplumber.open(abs_input) as pdf:
for i, page in enumerate(pdf.pages):
page_parts: list[str] = []
# 提取表格
tables = page.extract_tables()
table_bboxes = []
if tables:
for tbl_settings in page.find_tables():
table_bboxes.append(tbl_settings.bbox)
# 提取文本(排除表格区域的文本)
text = page.extract_text() or ""
if text.strip():
any_text = True
# 按行处理文本,识别标题
lines = text.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
# 简单的标题检测:短行 + 无标点结尾
is_heading = (len(line) < 40 and not line.endswith(('', '', '', '', '', ',', '.', ';'))
and not line.startswith(('', '('))
and re.match(r'^[一二三四五六七八九十\d]+[、.]', line))
if is_heading:
escaped = self._escape_html(line)
page_parts.append(f'<h3>{escaped}</h3>')
else:
escaped = self._escape_html(line)
page_parts.append(f'<p>{escaped}</p>')
# 渲染表格
for table in tables:
if not table:
continue
page_parts.append('<table class="pdf-table">')
for row_idx, row in enumerate(table):
page_parts.append('<tr>')
tag = 'th' if row_idx == 0 else 'td'
for cell in row:
cell_text = self._escape_html(str(cell)) if cell is not None else ''
page_parts.append(f'<{tag}>{cell_text}</{tag}>')
page_parts.append('</tr>')
page_parts.append('</table>')
page_html = '\n'.join(page_parts)
sections.append(
f'<section class="pdf-page" data-page="{i + 1}" '
'style="margin-bottom:1.5em;padding-bottom:1em;border-bottom:1px solid #e5e5e5;">'
f'<div style="font-size:12px;color:#888;margin-bottom:8px;">第 {i + 1} 页</div>'
f"{page_html}</section>"
f'<section class="pdf-page" data-page="{i + 1}">'
f'<div class="pdf-page-num">第 {i + 1} 页</div>'
f'{page_html}</section>'
)
css = '''<style>
.pdf-preview { font-family: system-ui, -apple-system, sans-serif; line-height: 1.6; max-width: 100%; }
.pdf-preview > div { width: 100% !important; max-width: 100% !important; }
.pdf-preview img { max-width: 100%; height: auto; }
.pdf-preview p { margin: 0.3em 0; }
.pdf-preview span { line-height: 1.5; }
.pdf-page > div { width: 100% !important; max-width: 100% !important; }
.pdf-preview { font-family: "PingFang SC", "Microsoft YaHei", system-ui, sans-serif; line-height: 1.8; color: #333; }
.pdf-page { margin-bottom: 1.5em; padding-bottom: 1em; border-bottom: 1px solid #e5e5e5; }
.pdf-page-num { font-size: 12px; color: #999; margin-bottom: 8px; }
.pdf-preview p { margin: 0.3em 0; font-size: 15px; text-indent: 0; }
.pdf-preview h3 { font-size: 16px; font-weight: bold; margin: 1em 0 0.4em; color: #222; }
.pdf-table { border-collapse: collapse; width: 100%; margin: 1em 0; font-size: 14px; }
.pdf-table th, .pdf-table td { border: 1px solid #d0d0d0; padding: 6px 10px; text-align: left; vertical-align: top; }
.pdf-table th { background: #f5f7fa; font-weight: bold; }
.pdf-table tr:nth-child(even) { background: #fafbff; }
</style>'''
if not any_text:
wrapper = (
f'{css}<div class="pdf-preview">'
"<p><em>(未能从 PDF 提取到文本,可能是扫描件或加密文档。)</em></p></div>"
'<p><em>(未能从 PDF 提取到文本,可能是扫描件或加密文档。)</em></p></div>'
)
else:
wrapper = (

View File

@@ -7,6 +7,7 @@ from configs import (DEFAULT_VS_TYPE, EMBEDDING_MODEL,
EXPR,
CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
logger, log_verbose, POLICY_KNOWLEDGE_BASE)
from configs.kb_config import ck_mysql_config
from configs.model_config import LLM_MODELS
from server.knowledge_base.cleanpdf import PdfConverter
from server.knowledge_base.file_converter import FileConverter