import base64 from datetime import datetime from html import escape import os import re import subprocess import tempfile import uuid import docx import markdown import fitz # PyMuPDF from docx import Document from typing import Optional from collections import defaultdict import zipfile from lxml import etree from docx import Document from docx.oxml import parse_xml from io import BytesIO import base64 import os import xml.etree.ElementTree as ET import openpyxl import pandas as pd import xlrd from openpyxl.styles import Font, PatternFill from configs.kb_config import ( GENERATED_IMAGES_BASE_PATH, IMAGE_SERVER_URL_TEMPLATE, PDF_CONVERT_KB_ROOT, ) NS = { 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', 'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships', 'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math', 'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture', 'v': 'urn:schemas-microsoft-com:vml', 'pkg': 'http://schemas.openxmlformats.org/package/2006/relationships', 'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape', 'o': 'urn:schemas-microsoft-com:office:office', 'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006', } class FileConverter: def __init__(self, libreoffice_path: str = "libreoffice"): self.libreoffice_path = libreoffice_path self._default_image_dir = "/home/albert/Documents/docx_images" def _clean_html(self, html: str) -> str: """HTML后处理方法""" # 提取body内容 body_match = re.search(r'
]*>(.*?)', html, re.DOTALL) content = body_match.group(1) if body_match else html # 清理不需要的标签和属性 content = re.sub(r'{line}
' for line in content.splitlines() if line.strip()] return self._save_html(f'{"".join(paragraphs)}', output_path) except Exception as e: raise RuntimeError(f"文本转换失败: {str(e)}") def doc_to_html(self, input_path: str, output_path: Optional[str] = None) -> str: """DOC转换方法""" try: with tempfile.TemporaryDirectory() as temp_dir: # 转换DOC到DOCX cmd = [ self.libreoffice_path, '--headless', '--convert-to', 'docx', '--outdir', temp_dir, input_path ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"LibreOffice错误: {result.stderr}") # 获取转换后的DOCX路径 base_name = os.path.splitext(os.path.basename(input_path))[0] converted_docx = os.path.join(temp_dir, f"{base_name}.docx") if not os.path.exists(converted_docx): raise FileNotFoundError("转换后的DOCX文件未找到") # 使用DOCX处理流程 return self.docx_to_html(converted_docx, output_path) except Exception as e: raise RuntimeError(f"DOC转换失败: {str(e)}") # def docx_to_html(self, input_path: str, output_path: Optional[str] = None) -> str: # """DOCX转换方法""" # try: # doc = Document(input_path) # html = [''] # # 按文档顺序处理所有元素 # for element in doc.element.body: # # 处理段落 # if element.tag.endswith('p'): # para = docx.text.paragraph.Paragraph(element, doc) # if not para.text.strip(): # continue # style_name = getattr(para.style, "name", None) # if style_name and style_name.startswith('Heading'): # level = min(int(para.style.name[-1]), 6) # html.append(f'{para.text}
') # # 处理表格 # elif element.tag.endswith('tbl'): # table = docx.table.Table(element, doc) # # 添加表格容器 # html.append('| {cell.text} | ') # html.append('
| ]*)>', r' | ',
html,
flags=re.IGNORECASE
)
return html
def _preserve_formulas(self, md_content: str) -> tuple:
"""公式预处理:将公式替换为唯一占位符"""
formula_map = {}
# 匹配块级公式 $$...$$
def block_replace(match):
formula_id = uuid.uuid4().hex
formula_map[formula_id] = match.group(0)
return f'\n\nFORMULA_BLOCK_{formula_id}\n\n'
# 匹配行内公式 $...$
def inline_replace(match):
formula_id = uuid.uuid4().hex
formula_map[formula_id] = match.group(0)
return f'FORMULA_INLINE_{formula_id}'
# 按顺序处理块级公式和行内公式
processed_content = re.sub(
r'\$\$(.*?)\$\$',
block_replace,
md_content,
flags=re.DOTALL
)
processed_content = re.sub(
r'(? str:
"""公式还原:将占位符替换回原始公式内容"""
# 输入校验
if not isinstance(html, str) or not isinstance(formula_map, dict):
raise ValueError("参数类型错误:html 必须是字符串,formula_map 必须是字典")
# 定义通用的替换函数
def replace_formula(match):
key = match.group(1)
return formula_map.get(key, f"{{UNKNOWN_FORMULA_{key}}}") # 防止公式丢失时静默失败
# 块级公式还原
html = re.sub(r'FORMULA_BLOCK_([a-f0-9]{32})', replace_formula, html)
# 行内公式还原
html = re.sub(r'FORMULA_INLINE_([a-f0-9]{32})', replace_formula, html)
return html
def docx_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
# 初始化参数
image_dir = self._default_image_dir
os.makedirs(image_dir, exist_ok=True)
# 读取DOCX文件
doc = Document(input_path)
html = ['']
# 处理批注和注释
comment_result = self._extract_comments(input_path)
active_comments = comment_result["active_comments"]
deleted_comments = comment_result["deleted_comments"]
if active_comments:
html.append('删除:') for comment_id, comment_info in deleted_comments.items(): author = comment_info["author"] date = comment_info["date"] text = comment_info["text"] html.append(f'[{comment_id}] {author}({date})删除的内容: {text} ')
html.append('')
# 处理输出
html_str = '\n'.join(html)
if output_path:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(html_str)
return self._save_docx_html(html_str)
def _extract_comments(self, docx_path: str) -> dict:
    """Collect review comments from a DOCX package, split into active and deleted.

    Comment metadata and text are read from ``word/comments.xml``. A comment
    is flagged as deleted when ``word/document.xml`` contains a
    ``w:commentReference`` to it inside a tracked deletion (``w:del``).

    Args:
        docx_path: Path of the .docx file; it is opened as a zip archive.

    Returns:
        A dict with two keys, ``"active_comments"`` and ``"deleted_comments"``.
        Each maps a comment's ``new_id`` to a record holding ``original_id``,
        ``new_id``, ``author``, ``date``, ``text`` and ``deleted``.
    """
    id_attr = f'{{{NS["w"]}}}id'
    author_attr = f'{{{NS["w"]}}}author'
    date_attr = f'{{{NS["w"]}}}date'

    # The document may come from an untrusted source: never expand entities
    # or fetch external resources while parsing its XML parts.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)

    comments = {}
    deleted_comment_ids = set()

    with zipfile.ZipFile(docx_path) as z:
        part_names = set(z.namelist())

        # Basic comment information.
        if 'word/comments.xml' in part_names:
            comments_tree = etree.fromstring(z.read('word/comments.xml'), parser)
            for comm in comments_tree.xpath('//w:comment', namespaces=NS):
                comm_id = comm.get(id_attr)
                if comm_id is None:
                    # Without an id the comment cannot be matched to any
                    # reference in the document body, so it is ignored.
                    continue

                author = comm.get(author_attr, 'Unknown')
                date_str = comm.get(date_attr, '')
                if date_str:
                    try:
                        # ISO 8601 timestamp; a trailing 'Z' is rewritten so
                        # fromisoformat() accepts it on older Pythons.
                        parsed = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
                        date = parsed.strftime('%Y-%m-%d %H:%M:%S')
                    except ValueError:
                        # Unparseable: keep the raw string.
                        date = date_str
                else:
                    date = ''

                text = ''.join(comm.xpath('.//w:t/text()', namespaces=NS))

                try:
                    # Displayed ids are the original id shifted by one.
                    new_id = str(int(comm_id) + 1)
                except ValueError:
                    # A malformed id must not abort the whole extraction.
                    new_id = comm_id

                comments[comm_id] = {
                    "original_id": comm_id,
                    "new_id": new_id,
                    "author": author,
                    "date": date,
                    "text": text.strip(),
                    "deleted": False,
                }

        # Comment references that sit inside tracked deletions.
        if 'word/document.xml' in part_names:
            doc_tree = etree.fromstring(z.read('word/document.xml'), parser)
            for del_ref in doc_tree.xpath('//w:del//w:commentReference', namespaces=NS):
                deleted_comment_ids.add(del_ref.get(id_attr))

    # Flag the comments whose reference was deleted.
    for comm_id in deleted_comment_ids:
        if comm_id in comments:
            comments[comm_id]["deleted"] = True

    return {
        "active_comments": {
            v["new_id"]: v for v in comments.values() if not v["deleted"]
        },
        "deleted_comments": {
            v["new_id"]: v for v in comments.values() if v["deleted"]
        },
    }
def _process_paragraph(self, para, zip_file, image_dir, rels_tree, in_textbox=False) -> str:
for ac in para.xpath('.//mc:AlternateContent', namespaces=NS):
for node in ac.xpath('.//mc:Choice | .//mc:Fallback', namespaces=NS):
para.addprevious(node) # 把内容“提升”到 para 直接子节点
ac.getparent().remove(ac)
fragments = []
if not in_textbox:
# — VML 文本框 —
shapes_vml = para.xpath(
'.//w:pict//v:shape[v:textbox] | .//w:pict//v:rect[v:textbox]',
namespaces=NS
)
for shape in shapes_vml:
style = (shape.get('style') or '').rstrip(';') + ';'
fill = shape.find('.//v:fill', namespaces=NS)
img_url = None
if fill is not None:
relid = fill.get(f"{{{NS['o']}}}relid") or fill.get('src')
if relid:
img_url = (relid.startswith('rId')
and self._save_image_by_rid(relid, zip_file, image_dir)
or relid)
bg = f'background-image:url({img_url});background-size:cover;' if img_url else ''
fragments.append(f'')
for txbx in shape.xpath('.//v:textbox//w:txbxContent', namespaces=NS):
for p in txbx.xpath('.//w:p', namespaces=NS):
fragments.append(
self._process_paragraph(p, zip_file, image_dir, rels_tree, True)
)
fragments.append(' ')
# — DML 文本框 —
shapes_dml = para.xpath(
'.//w:drawing//wps:wsp[wps:txbx]',
namespaces=NS
)
for wsp in shapes_dml:
blip = wsp.find('.//wps:spPr/a:blipFill/a:blip', namespaces=NS)
img_url = None
if blip is not None:
rid = blip.get(f"{{{NS['r']}}}embed")
if rid:
img_url = self._save_image_by_rid(rid, zip_file, image_dir)
bg = f'background-image:url({img_url});background-size:cover;' if img_url else ''
fragments.append(f'')
for txbx in wsp.xpath('.//w:txbxContent', namespaces=NS):
for p in txbx.xpath('.//w:p', namespaces=NS):
fragments.append(
self._process_paragraph(p, zip_file, image_dir, rels_tree, True)
)
fragments.append(' ')
# 若有任何文本框内容,先返回
if fragments:
return ''.join(fragments)
# — 普通段落逻辑 —
p_props = para.xpath('.//w:pPr', namespaces=NS)
p_style = self._apply_paragraph_styles(p_props[0]) if p_props else ""
p_html = [f'']
for run in para.xpath('.//w:r', namespaces=NS):
text = ''.join(run.xpath('.//w:t/text()', namespaces=NS))
run_props = run.xpath('.//w:rPr', namespaces=NS)
if run_props:
text = self._apply_text_styles(run_props[0], text)
if run_props[0].xpath('.//w:strike', namespaces=NS):
text = f'
(本页无文本内容) ' parts: list[str] = [] for line in text.splitlines(): if line.strip(): parts.append(f"{escape(line)} ") else: parts.append("") return "".join(parts) if parts else ' (本页无文本内容) ' @staticmethod def _escape_html(text: str) -> str: """HTML 转义""" return text.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') def pdf_to_html(self, input_path: str, output_path: Optional[str] = None) -> str: """PDF 预览:使用 pdfplumber 提取文本和表格,生成干净的 HTML。""" allowed_pdf_root = os.path.abspath(PDF_CONVERT_KB_ROOT) abs_input = os.path.abspath(input_path) if abs_input != allowed_pdf_root and not abs_input.startswith(allowed_pdf_root + os.sep): return ( f"PDF路径不在知识库根目录下: input={input_path!r}, root={allowed_pdf_root!r}。" "可设置环境变量 PDF_CONVERT_KB_ROOT。" ) if not os.path.isfile(abs_input): return f"PDF文件不存在: {abs_input}" if os.path.splitext(abs_input)[1].lower() != ".pdf": return "不是 PDF 文件" try: import pdfplumber import re sections: list[str] = [] any_text = False with pdfplumber.open(abs_input) as pdf: for i, page in enumerate(pdf.pages): page_parts: list[str] = [] # 提取表格 tables = page.extract_tables() table_bboxes = [] if tables: for tbl_settings in page.find_tables(): table_bboxes.append(tbl_settings.bbox) # 提取文本(排除表格区域的文本) text = page.extract_text() or "" if text.strip(): any_text = True lines = text.split('\n') # 合并连续非空行为段落,空行分段,标题行独立 current_para = [] for line in lines: stripped = line.strip() if not stripped: # 空行 → 结束当前段落 if current_para: page_parts.append(f'{self._escape_html("".join(current_para))} ') current_para = [] continue # 标题检测 is_heading = (len(stripped) < 30 and not stripped.endswith(('。', ',', ';', '、', ':', ',', '.', ';')) and not stripped.startswith(('(', '(')) and re.match(r'^[一二三四五六七八九十\d]+[、..]', stripped)) if is_heading: # 先输出累积的段落 if current_para: page_parts.append(f'{self._escape_html("".join(current_para))} ') current_para = [] page_parts.append(f'{self._escape_html(stripped)}') else: current_para.append(stripped) # 输出最后一个段落 if current_para: 
page_parts.append(f'{self._escape_html("".join(current_para))} ') # 渲染表格 for table in tables: if not table: continue page_parts.append('第 {i + 1} 页 '
f'{page_html}'
' '
)
else:
wrapper = (
f'{css}(未能从 PDF 提取到文本,可能是扫描件或加密文档。) '
f"{''.join(sections)} "
)
return self._save_html(f"{wrapper}", output_path)
except Exception as e:
return f"PDF预览生成失败: {str(e)}"
def get_cell_style(self, cell, mode='xlsx', xls_book=None):
    """Return the inline CSS declarations describing a spreadsheet cell's formatting.

    Supported for ``mode='xlsx'`` (openpyxl cell): font colour, background
    colour, underline / strikethrough, superscript / subscript, bold, italic,
    font family and font size.

    Supported for ``mode='xls'`` (xlrd): font colour, underline /
    strikethrough, bold, italic and font family.

    Args:
        cell: An openpyxl cell for ``'xlsx'``. For ``'xls'``, either an object
            exposing ``rowx``, ``colx`` and ``sheet``, or an xlrd cell that
            carries ``xf_index``.
        mode: ``'xlsx'`` or ``'xls'``.
        xls_book: The xlrd workbook; required for ``mode='xls'``.

    Returns:
        The CSS declarations concatenated into one string (each ends with
        ``;``). Empty when nothing applies or the mode is not handled.
    """
    styles = []

    if mode == 'xlsx':
        # ============ openpyxl ============
        font = cell.font
        fill = cell.fill

        # Font colour: explicit RGB first, otherwise a colour name.
        if font and font.color and getattr(font.color, 'type', None) == 'rgb' and font.color.rgb:
            # The value may be ARGB; only the trailing RRGGBB part is CSS.
            styles.append(f"color: #{font.color.rgb[-6:]};")
        elif font and font.color and isinstance(font.color, str):
            styles.append(f"color: {self._get_highlight_color(font.color)};")

        # Background colour, only when a fill pattern is actually set.
        if (fill
                and getattr(fill, 'patternType', None) not in (None, 'none')
                and getattr(fill, 'fgColor', None)):
            fg = fill.fgColor
            if getattr(fg, 'type', None) == 'rgb' and fg.rgb:
                styles.append(f"background-color: #{fg.rgb[-6:]};")
            elif isinstance(fg, str):
                styles.append(f"background-color: {self._get_highlight_color(fg)};")

        if font:
            # Underline and strikethrough share one text-decoration property.
            decorations = []
            if getattr(font, 'underline', False):
                decorations.append("underline")
            if getattr(font, 'strike', False):
                decorations.append("line-through")
            if decorations:
                styles.append(f"text-decoration: {' '.join(decorations)};")

            # Superscript / subscript.
            vert_align = getattr(font, 'vertAlign', None)
            if vert_align == 'superscript':
                styles.append("vertical-align: super; font-size: smaller;")
            elif vert_align == 'subscript':
                styles.append("vertical-align: sub; font-size: smaller;")

            # Bold / italic.
            if font.bold:
                styles.append("font-weight: bold;")
            if font.italic:
                styles.append("font-style: italic;")

            # Font family and size.
            if font.name:
                styles.append(f"font-family: '{font.name}';")
            if font.size:
                styles.append(f"font-size: {font.size}pt;")

    elif mode == 'xls' and xls_book is not None:
        # ============ xlrd ============
        rowx, colx = getattr(cell, 'rowx', None), getattr(cell, 'colx', None)
        if rowx is not None and colx is not None:
            xf_index = cell.sheet.cell_xf_index(rowx, colx)
        else:
            # Plain xlrd cells have no coordinates but do carry their XF
            # index when the workbook was opened with formatting_info=True.
            xf_index = getattr(cell, 'xf_index', None)

        if xf_index is not None:
            xf = xls_book.xf_list[xf_index]
            fnt = xls_book.font_list[xf.font_index]

            # Font colour: resolve the palette index through the workbook's
            # own colour map instead of a hard-coded index table.
            colour_map = getattr(xls_book, 'colour_map', None) or {}
            rgb = colour_map.get(fnt.colour_index)
            if rgb:
                red, green, blue = rgb
                styles.append(f"color: #{red:02X}{green:02X}{blue:02X};")

            # Underline and strikethrough (the xlrd attribute is 'struck_out').
            decorations = []
            if getattr(fnt, 'underline_type', 0):
                decorations.append("underline")
            if getattr(fnt, 'struck_out', False):
                decorations.append("line-through")
            if decorations:
                styles.append(f"text-decoration: {' '.join(decorations)};")

            # Bold / italic.
            if getattr(fnt, 'bold', False):
                styles.append("font-weight: bold;")
            if getattr(fnt, 'italic', False):
                styles.append("font-style: italic;")

            # Font family.
            if getattr(fnt, 'name', None):
                styles.append(f"font-family: '{fnt.name}';")

    return ''.join(styles)
def xlsx_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
try:
import openpyxl
from openpyxl.utils import get_column_letter
wb = openpyxl.load_workbook(input_path, data_only=True)
style = ''''''
html = []
for idx, sheet in enumerate(wb.worksheets):
html.append(f'Sheet {idx+1}: {sheet.title}') # 计算列宽 col_widths = {} for col_idx in range(1, sheet.max_column + 1): col_letter = get_column_letter(col_idx) dim = sheet.column_dimensions.get(col_letter) if dim and dim.width and dim.width > 0: # openpyxl width 以字符数为单位,约 7px/字符 col_widths[col_idx] = max(60, int(dim.width * 7.5)) else: # 根据内容估算宽度 max_len = 8 for row_idx in range(1, min(sheet.max_row + 1, 50)): cell = sheet.cell(row=row_idx, column=col_idx) if cell.value is not None: max_len = max(max_len, len(str(cell.value))) col_widths[col_idx] = max(60, min(300, max_len * 9)) html.append('')
html.append(' ')
html_str = style + ''.join(html)
return self._save_html(f'{html_str}', output_path)
except Exception as e:
return f"转换失败: {str(e)}"
def xls_to_html(self, input_path: str, output_path: Optional[str] = None) -> str:
try:
import xlrd
xls = xlrd.open_workbook(input_path, formatting_info=True)
body = []
style = ''''''
for idx, sheet in enumerate(xls.sheets()):
body.append(f'
Sheet {idx+1}: {sheet.name}') body.append('')
body.append(' ')
body_content = '\n'.join(body)
html_body = f'{style}{body_content}'
return self._save_html(f'{html_body}', output_path)
except Exception as e:
return f"转换失败: {str(e)}"
# Usage example
if __name__ == "__main__":
    converter = FileConverter()
    try:
        # Sample conversions; each input format goes through its own converter.
        # Note that every call writes to the same "output.html".
        converter.txt_to_html("input.md", "output.html")
        converter.txt_to_html("input.txt", "output.html")
        converter.doc_to_html("input.doc", "output.html")
        converter.docx_to_html("input.docx", "output.html")
        # Spreadsheets are not DOCX packages: use the spreadsheet converters.
        converter.xlsx_to_html("input.xlsx", "output.html")
        converter.xls_to_html("input.xls", "output.html")
    except Exception as e:
        print(f"转换错误: {str(e)}")
|
|---|
审阅批注:
') for comment_id, comment_info in active_comments.items(): author = comment_info["author"] date = comment_info["date"] text = comment_info["text"] html.append(f'