import os import re from typing import Optional from bs4 import BeautifulSoup from collections import defaultdict import cssutils from server.knowledge_base.file_converter import FileConverter import uuid import base64 class PdfConverter(FileConverter): def _clean_pdf_html(self, html: str) -> str: """HTML后处理方法""" soup = BeautifulSoup(html, 'html.parser') # 处理样式表中的CSS规则 def process_rule(rule): if rule.type == rule.MEDIA_RULE: for nested_rule in rule: process_rule(nested_rule) elif rule.type == rule.STYLE_RULE: # 移除文本选择限制属性 for prop in ['user-select', '-webkit-user-select', '-moz-user-select', '-ms-user-select']: rule.style.removeProperty(prop) # 原有处理逻辑保持不变 if any('#page-container-1' in selector.selectorText for selector in rule.selectorList): rule.style.removeProperty('background-color') rule.style.removeProperty('background-image') if any(re.search(r'(^|[\s>+~])\.pf($|[\s\[.:>+~])', selector.selectorText) for selector in rule.selectorList): for prop in ['box-shadow', 'border-collapse']: for _ in range(3): if rule.style.removeProperty(prop): break # 处理内联样式 def clean_inline_styles(tag): if tag.has_attr('style'): style = cssutils.parseStyle(tag['style']) # 移除文本选择限制属性 for prop in ['user-select', '-webkit-user-select', '-moz-user-select', '-ms-user-select']: style.removeProperty(prop) # 原有处理逻辑保持不变 if tag.get('id') == 'page-container-1': style.removeProperty('background-color') style.removeProperty('background-image') if 'pf' in tag.get('class', []): style.removeProperty('box-shadow') style.removeProperty('border-collapse') tag['style'] = style.cssText.replace('\n', ' ').strip() if not tag['style']: del tag['style'] # 清理空的和仅含空格的span标签 for span in soup.find_all('span'): # 判断是否包含可见内容 if not span.text.strip(): span.decompose() else: # 清理内部的空白字符 if span.string and span.string.isspace(): span.string.replace_with(' ') # 处理包含多个空白文本节点的情况 elif all(isinstance(c, str) and c.isspace() for c in span.contents): span.replace_with(' ') # 原有处理流程 for style_tag in soup.find_all('style'): 
if style_tag.string: try: sheet = cssutils.parseString(style_tag.string) for rule in sheet: process_rule(rule) style_tag.string = sheet.cssText.decode('utf-8')\ .replace('\\n', '\n')\ .replace(' !important', '!important') except Exception as e: print(f"CSS处理错误: {str(e)}") continue for container in soup.select('#page-container-1'): clean_inline_styles(container) for pf_element in soup.select('.pf'): clean_inline_styles(pf_element) content = str(soup) content = self._add_pdf_element_ids(content) if hasattr(self, 'page_container_id') and self.page_container_id: new_id = self.page_container_id head_pattern = re.compile( r'(
]*>)(.*?)()', re.DOTALL | re.IGNORECASE ) def replace_head(match): head_content = match.group(2) head_content = re.sub( r'(id\s*=\s*["\']?)page-container(["\'\]>])', f'\\g<1>{new_id}\\g<2>', head_content ) head_content = re.sub( r'(#[^{\s>]+?{.*?)(\bbackground-(color|image)\s*:[^;]+;?)', lambda m: m.group(1) if m.group(2) else m.group(0), head_content, flags=re.DOTALL|re.IGNORECASE ) return f"{match.group(1)}{head_content}{match.group(3)}" content = head_pattern.sub(replace_head, content) content = re.sub( r'', '', content, flags=re.IGNORECASE ) return content.strip() def _add_pdf_element_ids(self, content: str) -> str: """为元素添加唯一ID""" counters = defaultdict(int) self.page_container_id = None # 重置ID记录 def replace_tag(match): tag = match.group(1).lower() attrs = match.group(2) # 处理page-container的特殊逻辑 if tag == "div": id_match = re.search( r'\bid\s*=\s*["\']page-container["\']', attrs, flags=re.IGNORECASE ) if id_match: # 生成唯一ID并记录 if not self.page_container_id: counters['page-container'] += 1 self.page_container_id = f"page-container-{counters['page-container']}" # 保留其他属性 clean_attrs = re.sub(r'\s+id="[^"]*"', '', attrs) return f'")
# for span in line["spans"]:
# text_nums += 1
# bbox = span["bbox"]
# text = span["text"]
# font = span["font"] # 字体
# size = span["size"] # 字体大小
# color = span["color"] # 字体颜色
# # 动态生成CSS样式
# css_style = f'font-family: {font}; font-size: {size}px; color: #{color:06x};'
# percent_left = (bbox[0]) / page_width * 100
# # 根据字体大小判断标题
# if size > 20: # 假设大于20的字体为标题
# if text_nums == 1:
# html.append(f'{text.strip()}')
# else:
# html.append(f'{text.strip()}')
# else:
# if text_nums == 1:
# html.append(f'{text.strip()}')
# else:
# html.append(f'{text.strip()}')
# if is_code_block or size<=20:
# if is_code_block:
# html.append("")
# else:
# html.append("