[RAG] PDF阅读模式改用pdfplumber(文本+表格提取,干净HTML);修复ck_mysql_config导入
This commit is contained in:
@@ -907,13 +907,6 @@ onMounted(async () => {
|
|||||||
flex: 1; overflow: auto; position: relative; padding: 0;
|
flex: 1; overflow: auto; position: relative; padding: 0;
|
||||||
.view-md {
|
.view-md {
|
||||||
padding: 20px;
|
padding: 20px;
|
||||||
// 覆盖 PyMuPDF get_text("html") 输出的固定宽度
|
|
||||||
:deep(div) { max-width: 100% !important; }
|
|
||||||
:deep(div[style*="width:"]) { width: auto !important; max-width: 100% !important; }
|
|
||||||
:deep(.pdf-page) { max-width: 100% !important; }
|
|
||||||
:deep(.pdf-page > div) { width: auto !important; max-width: 100% !important; }
|
|
||||||
:deep(.pdf-preview) { max-width: 100% !important; }
|
|
||||||
:deep(section) { max-width: 100% !important; }
|
|
||||||
:deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; }
|
:deep(p) { font-size: 15px; line-height: 1.8rem; margin-block-start: 0; }
|
||||||
:deep(.highlight) { background: #D0EAC8; }
|
:deep(.highlight) { background: #D0EAC8; }
|
||||||
:deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; }
|
:deep(.note-flag) { width: 23px; height: 28px; line-height: 28px; display: inline-block; text-align: center; font-weight: bold; font-size: 10px; margin-left: 8px; cursor: pointer; background: url("@/assets/images/reading/note.png"); color: #004EA0; background-size: contain !important; background-repeat: no-repeat !important; background-position: center bottom !important; }
|
||||||
|
|||||||
@@ -915,8 +915,13 @@ class FileConverter:
|
|||||||
parts.append("<br/>")
|
parts.append("<br/>")
|
||||||
return "".join(parts) if parts else '<p><em>(本页无文本内容)</em></p>'
|
return "".join(parts) if parts else '<p><em>(本页无文本内容)</em></p>'
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _escape_html(text: str) -> str:
|
||||||
|
"""HTML 转义"""
|
||||||
|
return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
|
||||||
|
|
||||||
def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
|
def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
|
||||||
"""PDF 预览:使用 PyMuPDF 的 get_text("html") 保留格式、字体、图片。"""
|
"""PDF 预览:使用 pdfplumber 提取文本和表格,生成干净的 HTML。"""
|
||||||
allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT)
|
allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT)
|
||||||
abs_input = os.path.abspath(input_path)
|
abs_input = os.path.abspath(input_path)
|
||||||
if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep):
|
if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep):
|
||||||
@@ -930,35 +935,82 @@ class FileConverter:
|
|||||||
return "不是 PDF 文件"
|
return "不是 PDF 文件"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
import pdfplumber
|
||||||
|
import re
|
||||||
|
|
||||||
sections: list[str] = []
|
sections: list[str] = []
|
||||||
any_text = False
|
any_text = False
|
||||||
with fitz.open(abs_input) as doc:
|
|
||||||
for i in range(len(doc)):
|
with pdfplumber.open(abs_input) as pdf:
|
||||||
page = doc.load_page(i)
|
for i, page in enumerate(pdf.pages):
|
||||||
# 使用 get_text("html") 保留格式和图片(base64内嵌)
|
page_parts: list[str] = []
|
||||||
page_html = (page.get_text("html") or "").strip()
|
|
||||||
if page_html:
|
# 提取表格
|
||||||
|
tables = page.extract_tables()
|
||||||
|
table_bboxes = []
|
||||||
|
if tables:
|
||||||
|
for tbl_settings in page.find_tables():
|
||||||
|
table_bboxes.append(tbl_settings.bbox)
|
||||||
|
|
||||||
|
# 提取文本(排除表格区域的文本)
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
|
||||||
|
if text.strip():
|
||||||
any_text = True
|
any_text = True
|
||||||
|
# 按行处理文本,识别标题
|
||||||
|
lines = text.split('\n')
|
||||||
|
for line in lines:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
# 简单的标题检测:短行 + 无标点结尾
|
||||||
|
is_heading = (len(line) < 40 and not line.endswith(('。', ',', ';', '、', ':', ',', '.', ';'))
|
||||||
|
and not line.startswith(('(', '('))
|
||||||
|
and re.match(r'^[一二三四五六七八九十\d]+[、..]', line))
|
||||||
|
if is_heading:
|
||||||
|
escaped = self._escape_html(line)
|
||||||
|
page_parts.append(f'<h3>{escaped}</h3>')
|
||||||
|
else:
|
||||||
|
escaped = self._escape_html(line)
|
||||||
|
page_parts.append(f'<p>{escaped}</p>')
|
||||||
|
|
||||||
|
# 渲染表格
|
||||||
|
for table in tables:
|
||||||
|
if not table:
|
||||||
|
continue
|
||||||
|
page_parts.append('<table class="pdf-table">')
|
||||||
|
for row_idx, row in enumerate(table):
|
||||||
|
page_parts.append('<tr>')
|
||||||
|
tag = 'th' if row_idx == 0 else 'td'
|
||||||
|
for cell in row:
|
||||||
|
cell_text = self._escape_html(str(cell)) if cell is not None else ''
|
||||||
|
page_parts.append(f'<{tag}>{cell_text}</{tag}>')
|
||||||
|
page_parts.append('</tr>')
|
||||||
|
page_parts.append('</table>')
|
||||||
|
|
||||||
|
page_html = '\n'.join(page_parts)
|
||||||
sections.append(
|
sections.append(
|
||||||
f'<section class="pdf-page" data-page="{i + 1}" '
|
f'<section class="pdf-page" data-page="{i + 1}">'
|
||||||
'style="margin-bottom:1.5em;padding-bottom:1em;border-bottom:1px solid #e5e5e5;">'
|
f'<div class="pdf-page-num">第 {i + 1} 页</div>'
|
||||||
f'<div style="font-size:12px;color:#888;margin-bottom:8px;">第 {i + 1} 页</div>'
|
f'{page_html}</section>'
|
||||||
f"{page_html}</section>"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
css = '''<style>
|
css = '''<style>
|
||||||
.pdf-preview { font-family: system-ui, -apple-system, sans-serif; line-height: 1.6; max-width: 100%; }
|
.pdf-preview { font-family: "PingFang SC", "Microsoft YaHei", system-ui, sans-serif; line-height: 1.8; color: #333; }
|
||||||
.pdf-preview > div { width: 100% !important; max-width: 100% !important; }
|
.pdf-page { margin-bottom: 1.5em; padding-bottom: 1em; border-bottom: 1px solid #e5e5e5; }
|
||||||
.pdf-preview img { max-width: 100%; height: auto; }
|
.pdf-page-num { font-size: 12px; color: #999; margin-bottom: 8px; }
|
||||||
.pdf-preview p { margin: 0.3em 0; }
|
.pdf-preview p { margin: 0.3em 0; font-size: 15px; text-indent: 0; }
|
||||||
.pdf-preview span { line-height: 1.5; }
|
.pdf-preview h3 { font-size: 16px; font-weight: bold; margin: 1em 0 0.4em; color: #222; }
|
||||||
.pdf-page > div { width: 100% !important; max-width: 100% !important; }
|
.pdf-table { border-collapse: collapse; width: 100%; margin: 1em 0; font-size: 14px; }
|
||||||
|
.pdf-table th, .pdf-table td { border: 1px solid #d0d0d0; padding: 6px 10px; text-align: left; vertical-align: top; }
|
||||||
|
.pdf-table th { background: #f5f7fa; font-weight: bold; }
|
||||||
|
.pdf-table tr:nth-child(even) { background: #fafbff; }
|
||||||
</style>'''
|
</style>'''
|
||||||
|
|
||||||
if not any_text:
|
if not any_text:
|
||||||
wrapper = (
|
wrapper = (
|
||||||
f'{css}<div class="pdf-preview">'
|
f'{css}<div class="pdf-preview">'
|
||||||
"<p><em>(未能从 PDF 提取到文本,可能是扫描件或加密文档。)</em></p></div>"
|
'<p><em>(未能从 PDF 提取到文本,可能是扫描件或加密文档。)</em></p></div>'
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
wrapper = (
|
wrapper = (
|
||||||
|
|||||||
@@ -4,9 +4,10 @@ import urllib
|
|||||||
from fastapi import File, Form, Body, Query, Response, UploadFile
|
from fastapi import File, Form, Body, Query, Response, UploadFile
|
||||||
from configs import (DEFAULT_VS_TYPE, EMBEDDING_MODEL,
|
from configs import (DEFAULT_VS_TYPE, EMBEDDING_MODEL,
|
||||||
VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
|
VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
|
||||||
EXPR,
|
EXPR,
|
||||||
CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
|
CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
|
||||||
logger, log_verbose, POLICY_KNOWLEDGE_BASE)
|
logger, log_verbose, POLICY_KNOWLEDGE_BASE)
|
||||||
|
from configs.kb_config import ck_mysql_config
|
||||||
from configs.model_config import LLM_MODELS
|
from configs.model_config import LLM_MODELS
|
||||||
from server.knowledge_base.cleanpdf import PdfConverter
|
from server.knowledge_base.cleanpdf import PdfConverter
|
||||||
from server.knowledge_base.file_converter import FileConverter
|
from server.knowledge_base.file_converter import FileConverter
|
||||||
|
|||||||
Reference in New Issue
Block a user