Files
gangyan/langchain-chat/server/knowledge_base/cleanpdf.py

373 lines
17 KiB
Python
Raw Normal View History

import os
import re
from typing import Optional
from bs4 import BeautifulSoup
from collections import defaultdict
import cssutils
from server.knowledge_base.file_converter import FileConverter
import uuid
import base64
class PdfConverter(FileConverter):
def _clean_pdf_html(self, html: str) -> str:
    """Post-process converted PDF HTML and return the cleaned markup.

    Steps, in order:

    1. Remove every ``<span>`` that holds no non-whitespace text.
    2. Rewrite each ``<style>`` sheet with cssutils: drop the
       ``user-select`` family from all rules, ``background-color`` /
       ``background-image`` from rules whose selector mentions
       ``#page-container-1``, and ``box-shadow`` / ``border-collapse``
       from rules that target the ``.pf`` class.
    3. Apply the same removals to the inline ``style`` attribute of
       ``#page-container-1`` and ``.pf`` elements.
    4. Assign element ids through ``_add_pdf_element_ids``.
    5. When that step recorded a page-container id, patch the ``<head>``
       content (id references and id-rule backgrounds).
    6. Strip all ``<script>`` elements.

    NOTE(review): steps 2 and 3 look for ``page-container-1``, but the
    container is only given that id in step 4, so they match only input
    that already carries it -- confirm the intended order.

    Args:
        html: HTML document text (presumably pdf2htmlEX output -- the
            ``.pf`` / ``page-container`` names suggest so; confirm).

    Returns:
        The cleaned HTML with leading/trailing whitespace removed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    selection_props = (
        'user-select', '-webkit-user-select',
        '-moz-user-select', '-ms-user-select',
    )
    # ".pf" as a class token: preceded by start/combinator/whitespace and
    # followed by end or another selector delimiter.
    pf_selector = re.compile(r'(^|[\s>+~])\.pf($|[\s\[.:>+~])')

    def process_rule(rule):
        """Clean one stylesheet rule, recursing into @media blocks."""
        if rule.type == rule.MEDIA_RULE:
            for nested_rule in rule:
                process_rule(nested_rule)
        elif rule.type == rule.STYLE_RULE:
            # Lift text-selection restrictions.
            for prop in selection_props:
                rule.style.removeProperty(prop)
            selector_texts = [sel.selectorText for sel in rule.selectorList]
            if any('#page-container-1' in text for text in selector_texts):
                rule.style.removeProperty('background-color')
                rule.style.removeProperty('background-image')
            if any(pf_selector.search(text) for text in selector_texts):
                # A single call suffices: the former 3x retry loop only
                # repeated when nothing had been removed, i.e. it was a no-op.
                for prop in ('box-shadow', 'border-collapse'):
                    rule.style.removeProperty(prop)

    def clean_inline_styles(tag):
        """Apply the same property removals to a tag's inline style."""
        if not tag.has_attr('style'):
            return
        style = cssutils.parseStyle(tag['style'])
        for prop in selection_props:
            style.removeProperty(prop)
        if tag.get('id') == 'page-container-1':
            style.removeProperty('background-color')
            style.removeProperty('background-image')
        if 'pf' in tag.get('class', []):
            style.removeProperty('box-shadow')
            style.removeProperty('border-collapse')
        flattened = style.cssText.replace('\n', ' ').strip()
        if flattened:
            tag['style'] = flattened
        else:
            # Nothing left: drop the attribute instead of keeping style="".
            del tag['style']

    # Remove spans that are empty or contain only whitespace. The previous
    # "else" branch could never run (it required whitespace-only content,
    # which this test has already excluded), so it has been dropped.
    for span in soup.find_all('span'):
        if getattr(span, 'decomposed', False):
            # Already destroyed together with an empty ancestor span.
            continue
        if not span.text.strip():
            span.decompose()

    for style_tag in soup.find_all('style'):
        if not style_tag.string:
            continue
        try:
            sheet = cssutils.parseString(style_tag.string)
            for rule in sheet:
                process_rule(rule)
            style_tag.string = (
                sheet.cssText.decode('utf-8')
                .replace('\\n', '\n')
                .replace(' !important', '!important')
            )
        except Exception as e:
            # Best effort: a sheet that fails to process keeps its original
            # text and the error is reported on stdout.
            print(f"CSS处理错误: {str(e)}")

    for container in soup.select('#page-container-1'):
        clean_inline_styles(container)
    for pf_element in soup.select('.pf'):
        clean_inline_styles(pf_element)

    content = self._add_pdf_element_ids(str(soup))

    new_id = getattr(self, 'page_container_id', None)
    if new_id:
        head_pattern = re.compile(
            r'(<head[^>]*>)(.*?)(</head>)',
            re.DOTALL | re.IGNORECASE
        )

        def replace_head(match):
            """Rename container id references and strip id-rule backgrounds."""
            head_content = re.sub(
                r'(id\s*=\s*["\']?)page-container(["\'\]>])',
                f'\\g<1>{new_id}\\g<2>',
                match.group(2)
            )
            # Stay inside one rule body ([^}]) and stop the value at ';' or
            # '}'. The earlier pattern used '.*?' with DOTALL and '[^;]+',
            # which could reach into later rules and swallow a closing brace
            # when the declaration had no trailing semicolon.
            head_content = re.sub(
                r'(#[^{\s>]+?\{[^}]*?)(\bbackground-(?:color|image)\s*:[^;}]+;?)',
                r'\g<1>',
                head_content,
                flags=re.IGNORECASE
            )
            return f"{match.group(1)}{head_content}{match.group(3)}"

        content = head_pattern.sub(replace_head, content)

    content = re.sub(
        r'<script\b[^>]*>[\s\S]*?</script>',
        '',
        content,
        flags=re.IGNORECASE
    )
    return content.strip()
def _add_pdf_element_ids(self, content: str) -> str:
"""为元素添加唯一ID"""
counters = defaultdict(int)
self.page_container_id = None # 重置ID记录
def replace_tag(match):
tag = match.group(1).lower()
attrs = match.group(2)
# 处理page-container的特殊逻辑
if tag == "div":
id_match = re.search(
r'\bid\s*=\s*["\']page-container["\']',
attrs,
flags=re.IGNORECASE
)
if id_match:
# 生成唯一ID并记录
if not self.page_container_id:
counters['page-container'] += 1
self.page_container_id = f"page-container-{counters['page-container']}"
# 保留其他属性
clean_attrs = re.sub(r'\s+id="[^"]*"', '', attrs)
return f'<div id="{self.page_container_id}"{clean_attrs}>'
# 常规标签处理
counters[tag] += 1
clean_attrs = re.sub(r'\s+id="[^"]*"', '', attrs)
return f'<{tag} id="{tag}-{counters[tag]}"{clean_attrs}>'
# 处理所有目标标签
return re.sub(
r'<(h[1-6]|p|div|span)(\b[^>]*)>',
replace_tag,
content,
flags=re.IGNORECASE
)
def _save_pdf_html(self, content: str, output_path: Optional[str] = None) -> str:
"""统一保存方法"""
cleaned = self._clean_pdf_html(content)
# cleaned = self._add_pdf_element_ids(content)
if output_path:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(cleaned)
return cleaned
# def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
# """PDF转换方法"""
# cmd = [
# 'pdf2htmlEX',
# '--zoom', '1.2', # 放大
# '--split-pages', '0', # 保持整体布局
# # '--embed-css', '0', # 避免内联样式冲突
# # '--embed-image', '0', # 避免内联图片冲突
# # '--optimize-text', '1', # 优化文本渲染
# input_path
# ]
# result = subprocess.run(
# 'cd /data3/pdffiles && ' + ' '.join(cmd),
# shell=True,
# stdout=subprocess.PIPE,
# stderr=subprocess.STDOUT
# )
# print(f"转换状态: {result.returncode}\n输出: {result.stdout.decode()[:200]}")
# # 准备文件名
# file_name = os.path.basename(input_path)[:-3] + "html"
# html_path = f"/data3/pdffiles/{file_name}"
# if not os.path.exists(html_path):
# return f"{file_name} 转换失败"
# # 读取并处理HTML内容
# with open(html_path, 'r', encoding='utf-8') as file:
# soup = BeautifulSoup(file, 'html.parser')
# # 移除注释
# for comment in soup.find_all(string=lambda text: isinstance(text, str) and "Created by pdf2htmlEX" in text):
# comment.extract()
# # 移除loading-indicator
# for div in soup.find_all('div', class_='loading-indicator'):
# div.decompose()
# # 移除所有包含sidebar的div
# for div in soup.find_all('div', id=lambda x: x and 'sidebar' in x.lower()):
# div.decompose()
# # 转换为字符串并处理base64
# html_content = str(soup)
# # 清理临时文件
# os.remove(html_path)
# # 处理base64图片
# html_content = self.read_and_replace_base64(
# html_content,
# output_dir={GENERATED_IMAGES_BASE_PATH}
# )
# return f"{self._save_pdf_html(html_content, output_path)}"
def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
    """Convert a PDF to preview HTML by delegating to the base class.

    Kept as an explicit override so that post-processing can later be
    wrapped around the ``super()`` result here. (The original note says
    the base implementation extracts text locally with PyMuPDF; that code
    is not visible from this file.)
    """
    base_result = super().pdf_to_html(input_path, output_path)
    return base_result
def read_and_replace_base64(
    self,
    html_content: str,
    output_dir: str,
    url_prefix: str = "http://127.0.0.1:8099/chat_web_backend/get-image?file_name=",
) -> str:
    """Move inline base64 images out of the HTML and link to them instead.

    Every ``data:image/(png|jpg|jpeg);base64,...`` URI in *html_content*
    is decoded, written to *output_dir* under a unique file name, and
    replaced by ``url_prefix`` followed by that file name.

    Args:
        html_content: HTML text that may contain base64 data URIs.
        output_dir: Directory for the extracted images; created on demand
            when the first image is written.
        url_prefix: Text placed before the file name in the replacement
            URL. Defaults to the previously hard-coded local endpoint.

    Returns:
        The HTML text with each matched data URI replaced by a URL.

    Raises:
        binascii.Error: If a matched payload is not valid base64 (a
            ``ValueError`` subclass). No file is created for that image.
    """
    image_index = 0  # running suffix, in addition to the uuid, per image

    def replace_base64(match):
        nonlocal image_index
        _header, data = match.group(0).split(',', 1)
        file_extension = match.group(1)
        # Decode before touching the filesystem so a malformed payload
        # does not leave an empty file behind.
        image_bytes = base64.b64decode(data)
        file_name = f'image_{uuid.uuid1()}_{image_index}.{file_extension}'
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, file_name), 'wb') as image_file:
            image_file.write(image_bytes)
        image_index += 1
        return f"{url_prefix}{file_name}"

    base64_pattern = r'data:image/(png|jpg|jpeg);base64,[A-Za-z0-9+/=]+'
    return re.sub(base64_pattern, replace_base64, html_content)
# def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
# """PDF转换方法"""
# try:
# doc = fitz.open(input_path)
# page_width = doc[0].rect.width
# page_height = doc[0].rect.height
# border_radius = 5
# html = ['<style>','pre { background-color: #2d2d2d;color: #f8f8f2; padding: 10px;margin: 0;width: 80%;box-sizing: border-box;border-radius: 0px;}', '</style>', '<body style="position: relative;">']
# image_save_path = '{GENERATED_IMAGES_BASE_PATH}'
# pic_num =0
# # 确保图片保存路径存在
# os.makedirs(image_save_path, exist_ok=True)
# for page in doc:
# blocks = page.get_text("dict")["blocks"]
# sorted_blocks = sorted(blocks, key=lambda b: (b["bbox"][1], b["bbox"][0])) # 按y坐标和x坐标排序
# for block in sorted_blocks:
# if "image" in block:
# pic_num += 1
# bbox = block["bbox"]
# image_bytes = block["image"]
# image_ext = block["ext"]
# image_name = f'image_{page.number}_{pic_num}.{image_ext}'
# image_url = f'http://127.0.0.1:8099/chat_web_backend/get-image?file_name={image_name}'
# image_path = os.path.join(image_save_path, image_name)
# # 保存图片到指定路径
# with open(image_path, 'wb') as img_file:
# img_file.write(image_bytes)
# percent_left = (bbox[0]) / page_width * 100
# # 获取页面的宽度和高度
# container_width = page_width # 页面宽度
# container_height = page_height # 页面高度
# # 计算图像的宽度和高度
# img_width = bbox[2] - bbox[0] # 计算宽度
# img_height = bbox[3] - bbox[1] # 计算高度
# # 计算百分比
# width_percent = (img_width / container_width) * 100
# height_percent = (img_height / container_height) * 100
# html.append(f'<div style="width: {width_percent}%; height: {height_percent}%; margin-left: {percent_left}%;clear: both;overflow: auto;"><img src="{image_url}" alt="Image {pic_num}" style="max-width: 100%; height: auto;display: block;"/></div>')
# if "lines" in block:
# text_nums = 0
# for line in block["lines"]:
# is_code_block =any(span["font"].startswith(("Courier", "NSimSun")) for span in line["spans"]) # 假设代码使用Courier字体
# if is_code_block:
# html.append(f"<pre>")
# for span in line["spans"]:
# text_nums += 1
# bbox = span["bbox"]
# text = span["text"]
# font = span["font"] # 字体
# size = span["size"] # 字体大小
# color = span["color"] # 字体颜色
# # 动态生成CSS样式
# css_style = f'font-family: {font}; font-size: {size}px; color: #{color:06x};'
# percent_left = (bbox[0]) / page_width * 100
# # 根据字体大小判断标题
# if size > 20: # 假设大于20的字体为标题
# if text_nums == 1:
# html.append(f'<h2 style="{css_style};display: inline; margin-left: {percent_left}%; ">{text.strip()}</h2>')
# else:
# html.append(f'<h3 style="{css_style};display: inline;">{text.strip()}</h3>')
# else:
# if text_nums == 1:
# html.append(f'<p style="{css_style};display: inline; margin-left: {percent_left}%; ">{text.strip()}</p>')
# else:
# html.append(f'<p style="{css_style};display: inline; ">{text.strip()}</p>')
# if is_code_block or size<=20:
# if is_code_block:
# html.append("</pre>")
# else:
# html.append("<br>")
# else:
# html.append('<br>')
# # html.append('<br>')
# html.append('</body>')
# # 将HTML内容保存到指定路径
# html_content = ''.join(html)
# if output_path:
# with open(output_path, 'w', encoding='utf-8') as file:
# file.write(html_content)
# else:
# # 如果没有指定路径使用默认路径或返回HTML内容
# output_path = 'output.html'
# with open(output_path, 'w', encoding='utf-8') as file:
# file.write(html_content)
# return output_path
# except Exception as e:
# raise RuntimeError(f"PDF转换失败: {str(e)}")
# def replace_base64_with_url(self,html_content, output_dir):
# image_index = 0 # 用于生成唯一的文件名
# def replace_base64(match):
# nonlocal image_index
# base64_data = match.group(0)
# # 保存 Base64 图片并获取文件路径
# # 提取文件类型和实际的 Base64 数据
# header, data = base64_data.split(',', 1)
# file_extension = header.split(';')[0].split('/')[1] # 获取文件扩展名
# file_name = f'image_{uuid.uuid1()}_{image_index}.{file_extension}' # 生成文件名
# file_path = os.path.join(output_dir, file_name)
# # 将 Base64 数据解码并保存为文件
# with open(file_path, 'wb') as image_file:
# image_file.write(base64.b64decode(data))
# image_index += 1
# # 返回文件的 URL
# return f"http://127.0.0.1:8099/chat_web_backend/get-image?file_name={os.path.basename(file_path)}"
# # 使用正则表达式匹配 Base64 字符串
# base64_pattern = r'data:image/(png|jpg|jpeg);base64,[A-Za-z0-9+/=]+'
# updated_html_content = re.sub(base64_pattern, replace_base64, html_content)
# return updated_html_content