# gangyan/langchain-chat/server/knowledge_base/cleanpdf.py

import os
import re
from typing import Optional
from bs4 import BeautifulSoup
from collections import defaultdict
import cssutils
from server.knowledge_base.file_converter import FileConverter
import uuid
import base64

class PdfConverter(FileConverter):
    def _clean_pdf_html(self, html: str) -> str:
        """Post-process converter-generated HTML (presumably pdf2htmlEX output).

        Steps, in order:

        1. Remove every ``<span>`` that carries no visible text.
        2. Re-serialise each ``<style>`` sheet through cssutils, dropping the
           ``user-select`` family from all rules, background colour/image
           from rules whose selector mentions ``#page-container-1``, and
           ``box-shadow`` / ``border-collapse`` from rules targeting ``.pf``.
        3. Apply the same removals to the inline ``style`` attribute of
           ``#page-container-1`` and ``.pf`` elements.
        4. Assign sequential ids via ``_add_pdf_element_ids``.
        5. Inside ``<head>``, rename ``id=page-container`` references to the
           generated id and strip background declarations that follow an
           id selector.
        6. Remove all ``<script>`` elements.

        Args:
            html: Raw HTML markup to clean.

        Returns:
            The cleaned markup with leading/trailing whitespace stripped.
        """
        soup = BeautifulSoup(html, 'html.parser')

        user_select_props = ('user-select', '-webkit-user-select',
                             '-moz-user-select', '-ms-user-select')
        background_props = ('background-color', 'background-image')
        page_frame_props = ('box-shadow', 'border-collapse')
        # Matches ".pf" as a standalone class token inside a selector.
        pf_selector = re.compile(r'(^|[\s>+~])\.pf($|[\s\[.:>+~])')

        def drop_props(style, names):
            # cssutils removeProperty() removes every declaration of the
            # property in one call, so no retry loop is needed.
            for name in names:
                style.removeProperty(name)

        def process_rule(rule):
            """Clean one stylesheet rule, recursing into @media blocks."""
            if rule.type == rule.MEDIA_RULE:
                for nested_rule in rule:
                    process_rule(nested_rule)
                return
            if rule.type != rule.STYLE_RULE:
                return

            selectors = [selector.selectorText for selector in rule.selectorList]
            # Lift text-selection restrictions everywhere.
            drop_props(rule.style, user_select_props)
            # NOTE(review): ids are only renamed to "page-container-N" by
            # _add_pdf_element_ids further down, so this matches only when the
            # incoming HTML already uses "#page-container-1" - confirm intent.
            if any('#page-container-1' in text for text in selectors):
                drop_props(rule.style, background_props)
            if any(pf_selector.search(text) for text in selectors):
                drop_props(rule.style, page_frame_props)

        def clean_inline_styles(tag):
            """Apply the same property removals to a tag's style attribute."""
            if not tag.has_attr('style'):
                return
            style = cssutils.parseStyle(tag['style'])
            drop_props(style, user_select_props)
            if tag.get('id') == 'page-container-1':
                drop_props(style, background_props)
            if 'pf' in tag.get('class', []):
                drop_props(style, page_frame_props)
            remaining = style.cssText.replace('\n', ' ').strip()
            if remaining:
                tag['style'] = remaining
            else:
                # Nothing left to declare: drop the attribute entirely.
                del tag['style']

        # Remove spans without visible text. Whitespace-only spans are removed
        # outright as well (they are not collapsed into a single space).
        for span in soup.find_all('span'):
            if not span.text.strip():
                span.decompose()

        # Rewrite embedded stylesheets; a sheet that fails to parse is left
        # untouched so one bad <style> does not abort the whole clean-up.
        for style_tag in soup.find_all('style'):
            if not style_tag.string:
                continue
            try:
                sheet = cssutils.parseString(style_tag.string)
                for rule in sheet:
                    process_rule(rule)
                style_tag.string = (
                    sheet.cssText.decode('utf-8')
                    .replace('\\n', '\n')
                    .replace(' !important', '!important')
                )
            except Exception as e:
                print(f"CSS处理错误: {str(e)}")

        for container in soup.select('#page-container-1'):
            clean_inline_styles(container)
        for pf_element in soup.select('.pf'):
            clean_inline_styles(pf_element)

        # Assign ids on the serialised markup; this also records
        # self.page_container_id when a page-container div is present.
        content = self._add_pdf_element_ids(str(soup))

        new_id = getattr(self, 'page_container_id', None)
        if new_id:
            head_pattern = re.compile(
                r'(<head[^>]*>)(.*?)(</head>)',
                re.DOTALL | re.IGNORECASE
            )

            def replace_head(match):
                head_content = match.group(2)
                # Point id="page-container" references at the generated id.
                head_content = re.sub(
                    r'(id\s*=\s*["\']?)page-container(["\'\]>])',
                    f'\\g<1>{new_id}\\g<2>',
                    head_content
                )
                # Drop the background-color/background-image declaration that
                # follows an id selector; group 2 is never empty on a match,
                # so the replacement is simply group 1.
                head_content = re.sub(
                    r'(#[^{\s>]+?{.*?)(\bbackground-(color|image)\s*:[^;]+;?)',
                    r'\g<1>',
                    head_content,
                    flags=re.DOTALL | re.IGNORECASE
                )
                return f"{match.group(1)}{head_content}{match.group(3)}"

            content = head_pattern.sub(replace_head, content)

        # Strip all scripts from the final markup.
        content = re.sub(
            r'<script\b[^>]*>[\s\S]*?</script>',
            '',
            content,
            flags=re.IGNORECASE
        )

        return content.strip()

    def _add_pdf_element_ids(self, content: str) -> str:
        """为元素添加唯一ID"""
        counters = defaultdict(int)
        self.page_container_id = None  # 重置ID记录
        
        def replace_tag(match):
            tag = match.group(1).lower()
            attrs = match.group(2)
            
            # 处理page-container的特殊逻辑
            if tag == "div":
                id_match = re.search(
                    r'\bid\s*=\s*["\']page-container["\']',
                    attrs,
                    flags=re.IGNORECASE
                )
                if id_match:
                    # 生成唯一ID并记录
                    if not self.page_container_id:
                        counters['page-container'] += 1
                        self.page_container_id = f"page-container-{counters['page-container']}"
                    # 保留其他属性
                    clean_attrs = re.sub(r'\s+id="[^"]*"', '', attrs)
                    return f'<div id="{self.page_container_id}"{clean_attrs}>'
            
            # 常规标签处理
            counters[tag] += 1
            clean_attrs = re.sub(r'\s+id="[^"]*"', '', attrs)
            return f'<{tag} id="{tag}-{counters[tag]}"{clean_attrs}>'
        
        # 处理所有目标标签
        return re.sub(
            r'<(h[1-6]|p|div|span)(\b[^>]*)>',
            replace_tag,
            content,
            flags=re.IGNORECASE
        )
    def _save_pdf_html(self, content: str, output_path: Optional[str] = None) -> str:
        """Clean HTML and optionally persist it.

        The markup is run through ``_clean_pdf_html`` (which also assigns
        element ids). When ``output_path`` is truthy the cleaned markup is
        written there as UTF-8.

        Args:
            content: Raw HTML markup.
            output_path: Destination file, or ``None`` to skip writing.

        Returns:
            The cleaned markup.
        """
        result = self._clean_pdf_html(content)
        if not output_path:
            return result
        with open(output_path, 'w', encoding='utf-8') as handle:
            handle.write(result)
        return result
    
    # def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
    #     """PDF转换方法"""
    #     cmd = [
    #         'pdf2htmlEX',
    #         '--zoom', '1.2', # 放大
    #         '--split-pages', '0', # 保持整体布局
    #         # '--embed-css', '0', # 避免内联样式冲突
    #         # '--embed-image', '0', # 避免内联图片冲突
    #         # '--optimize-text', '1', # 优化文本渲染
    #         input_path
    #     ]
    #     result = subprocess.run(
    #         'cd /data3/pdffiles && ' + ' '.join(cmd),
    #         shell=True,
    #         stdout=subprocess.PIPE,
    #         stderr=subprocess.STDOUT
    #     )
    #     print(f"转换状态: {result.returncode}\n输出: {result.stdout.decode()[:200]}")

    #     # 准备文件名
    #     file_name = os.path.basename(input_path)[:-3] + "html"
    #     html_path = f"/data3/pdffiles/{file_name}"

    #     if not os.path.exists(html_path):
    #         return f"{file_name} 转换失败"

    #     # 读取并处理HTML内容
    #     with open(html_path, 'r', encoding='utf-8') as file:
    #         soup = BeautifulSoup(file, 'html.parser')

    #         # 移除注释
    #         for comment in soup.find_all(string=lambda text: isinstance(text, str) and "Created by pdf2htmlEX" in text):
    #             comment.extract()

    #         # 移除loading-indicator
    #         for div in soup.find_all('div', class_='loading-indicator'):
    #             div.decompose()

    #         # 移除所有包含sidebar的div
    #         for div in soup.find_all('div', id=lambda x: x and 'sidebar' in x.lower()):
    #             div.decompose()

    #         # 转换为字符串并处理base64
    #         html_content = str(soup)
        
    #     # 清理临时文件
    #     os.remove(html_path)

    #     # 处理base64图片
    #     html_content = self.read_and_replace_base64(
    #         html_content, 
    #         output_dir={GENERATED_IMAGES_BASE_PATH}
    #     )

    #     return f"{self._save_pdf_html(html_content, output_path)}"
    
    def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
        """Produce the PDF preview by delegating to the base class.

        The original note states the base implementation extracts text
        locally with PyMuPDF. This override adds nothing of its own; it is
        the place to wrap the ``super()`` result if post-processing is needed.
        """
        html = super().pdf_to_html(input_path, output_path)
        return html

    def read_and_replace_base64(self,html_content, output_dir):
        image_index = 0  # 用于生成唯一的文件名
        def replace_base64(match):
            nonlocal image_index
            base64_data = match.group(0)
            # 保存 Base64 图片并获取文件路径
            # 提取文件类型和实际的 Base64 数据
            header, data = base64_data.split(',', 1)
            file_extension = header.split(';')[0].split('/')[1]  # 获取文件扩展名
            file_name = f'image_{uuid.uuid1()}_{image_index}.{file_extension}'  # 生成文件名
            file_path = os.path.join(output_dir, file_name)

            # 将 Base64 数据解码并保存为文件
            with open(file_path, 'wb') as image_file:
                image_file.write(base64.b64decode(data))
            image_index += 1
            # 返回文件的 URL
            return f"http://127.0.0.1:8099/chat_web_backend/get-image?file_name={os.path.basename(file_path)}"

        # 使用正则表达式匹配 Base64 字符串
        base64_pattern = r'data:image/(png|jpg|jpeg);base64,[A-Za-z0-9+/=]+'
        # base64_pattern = r'data:image/(png|jpg|jpeg);base64,[A-Za-z0-9+/=]+|data:application/font-woff;base64,[A-Za-z0-9+/=]+'
        updated_html_content = re.sub(base64_pattern, replace_base64, html_content)
        return updated_html_content

    # def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
    #     """PDF转换方法"""
    #     try:
    #         doc = fitz.open(input_path)
    #         page_width = doc[0].rect.width
    #         page_height = doc[0].rect.height
    #         border_radius = 5
    #         html = ['<style>','pre { background-color: #2d2d2d;color: #f8f8f2; padding: 10px;margin: 0;width: 80%;box-sizing: border-box;border-radius: 0px;}', '</style>', '<body style="position: relative;">']
    #         image_save_path = '{GENERATED_IMAGES_BASE_PATH}'
    #         pic_num =0 
    #         # 确保图片保存路径存在
    #         os.makedirs(image_save_path, exist_ok=True)
            
    #         for page in doc:
    #             blocks = page.get_text("dict")["blocks"]
    #             sorted_blocks = sorted(blocks, key=lambda b: (b["bbox"][1], b["bbox"][0]))  # 按y坐标和x坐标排序
                
    #             for block in sorted_blocks:
    #                 if "image" in block:
    #                     pic_num += 1
    #                     bbox = block["bbox"]
    #                     image_bytes = block["image"]
    #                     image_ext = block["ext"]
    #                     image_name = f'image_{page.number}_{pic_num}.{image_ext}'
    #                     image_url = f'http://127.0.0.1:8099/chat_web_backend/get-image?file_name={image_name}'
    #                     image_path = os.path.join(image_save_path, image_name)
    #                     # 保存图片到指定路径
    #                     with open(image_path, 'wb') as img_file:
    #                         img_file.write(image_bytes)
    #                     percent_left = (bbox[0]) / page_width * 100
    #                     # 获取页面的宽度和高度
    #                     container_width = page_width  # 页面宽度
    #                     container_height = page_height  # 页面高度

    #                     # 计算图像的宽度和高度
    #                     img_width = bbox[2] - bbox[0]  # 计算宽度
    #                     img_height = bbox[3] - bbox[1]  # 计算高度
                        
    #                     # 计算百分比
    #                     width_percent = (img_width / container_width) * 100
    #                     height_percent = (img_height / container_height) * 100
    #                     html.append(f'<div  style="width: {width_percent}%; height: {height_percent}%; margin-left: {percent_left}%;clear: both;overflow: auto;"><img src="{image_url}" alt="Image {pic_num}"  style="max-width: 100%; height: auto;display: block;"/></div>')
    #                 if "lines" in block:
    #                     text_nums = 0
    #                     for line in block["lines"]:
    #                         is_code_block =any(span["font"].startswith(("Courier", "NSimSun")) for span in line["spans"]) # 假设代码使用Courier字体
    #                         if is_code_block:
    #                             html.append(f"<pre>")
    #                         for span in line["spans"]:
    #                             text_nums += 1
    #                             bbox = span["bbox"]
    #                             text = span["text"]
    #                             font = span["font"]  # 字体
    #                             size = span["size"]  # 字体大小
    #                             color = span["color"]  # 字体颜色

    #                             # 动态生成CSS样式
    #                             css_style = f'font-family: {font}; font-size: {size}px; color: #{color:06x};'
    #                             percent_left = (bbox[0]) / page_width * 100
    #                             # 根据字体大小判断标题
    #                             if size > 20:  # 假设大于20的字体为标题
    #                                 if text_nums == 1:
    #                                     html.append(f'<h2 style="{css_style};display: inline; margin-left: {percent_left}%; ">{text.strip()}</h2>')
    #                                 else:
    #                                     html.append(f'<h3 style="{css_style};display: inline;">{text.strip()}</h3>')
    #                             else:
    #                                 if text_nums == 1:
    #                                     html.append(f'<p style="{css_style};display: inline; margin-left: {percent_left}%; ">{text.strip()}</p>')
    #                                 else:
    #                                     html.append(f'<p style="{css_style};display: inline; ">{text.strip()}</p>')
                            
    #                         if is_code_block or size<=20:
    #                             if is_code_block:
    #                                 html.append("</pre>")
    #                             else:
    #                                 html.append("<br>")
    #                         else:
    #                             html.append('<br>')
    #                         # html.append('<br>')
            
    #         html.append('</body>')
            
    #         # 将HTML内容保存到指定路径
    #         html_content = ''.join(html)
    #         if output_path:
    #             with open(output_path, 'w', encoding='utf-8') as file:
    #                 file.write(html_content)
    #         else:
    #             # 如果没有指定路径，使用默认路径或返回HTML内容
    #             output_path = 'output.html'
    #             with open(output_path, 'w', encoding='utf-8') as file:
    #                 file.write(html_content)
            
    #         return output_path
    #     except Exception as e:
    #         raise RuntimeError(f"PDF转换失败: {str(e)}")
    # def replace_base64_with_url(self,html_content, output_dir): 
    #     image_index = 0  # 用于生成唯一的文件名
    #     def replace_base64(match):
    #         nonlocal image_index
    #         base64_data = match.group(0)
    #         # 保存 Base64 图片并获取文件路径
    #         # 提取文件类型和实际的 Base64 数据
    #         header, data = base64_data.split(',', 1)
    #         file_extension = header.split(';')[0].split('/')[1]  # 获取文件扩展名
    #         file_name = f'image_{uuid.uuid1()}_{image_index}.{file_extension}'  # 生成文件名
    #         file_path = os.path.join(output_dir, file_name)

    #         # 将 Base64 数据解码并保存为文件
    #         with open(file_path, 'wb') as image_file:
    #             image_file.write(base64.b64decode(data))
    #         image_index += 1
    #         # 返回文件的 URL
    #         return f"http://127.0.0.1:8099/chat_web_backend/get-image?file_name={os.path.basename(file_path)}"

    #     # 使用正则表达式匹配 Base64 字符串
    #     base64_pattern = r'data:image/(png|jpg|jpeg);base64,[A-Za-z0-9+/=]+'
    #     updated_html_content = re.sub(base64_pattern, replace_base64, html_content)
    #     return updated_html_content