# gangyan/langchain-chat/tests/docx_parser.py

import zipfile
from lxml import etree
import json
import os

def extract_docx_info(docx_path: str, output_json: str = 'docx_info.json') -> dict:
    """Summarize the structure of a DOCX file and write the summary as JSON.

    Opens the DOCX as a ZIP archive, counts element tags (by local name) in
    ``word/document.xml``, counts relationship types declared in
    ``word/_rels/document.xml.rels`` (when that part exists), and lists every
    archive member whose name starts with ``word/media/``. The summary is
    written to *output_json* as UTF-8 JSON and a confirmation line is printed.

    Args:
        docx_path: Path to the DOCX file to inspect.
        output_json: Path of the JSON file to write.

    Returns:
        The summary dict that was written, with keys ``source_docx``,
        ``elements``, ``relationships`` and ``media_files``.

    Raises:
        FileNotFoundError: If *docx_path* is not an existing regular file.
        KeyError: If the archive has no ``word/document.xml`` member
            (raised by ``ZipFile.read``).
    """
    if not os.path.isfile(docx_path):
        raise FileNotFoundError(f"未找到 DOCX 文件：{docx_path}")

    tag_counts = {}
    rels_counts = {}

    with zipfile.ZipFile(docx_path) as docx:
        file_list = docx.namelist()

        # Count element tags in document.xml. Restricting the iterator to
        # real elements skips comments and processing instructions, whose
        # ``tag`` is not a string and would make ``etree.QName`` raise.
        doc_tree = etree.fromstring(docx.read('word/document.xml'))
        for elem in doc_tree.iter(tag=etree.Element):
            tag = etree.QName(elem).localname
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

        # Count document relationships (document.xml.rels), keyed by the
        # last path segment of each relationship's Type URI.
        rels_path = 'word/_rels/document.xml.rels'
        if rels_path in file_list:
            rels_tree = etree.fromstring(docx.read(rels_path))
            for rel in rels_tree.findall(
                './/{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
            ):
                rel_type_uri = rel.get('Type')
                if rel_type_uri is None:
                    # Malformed entry without a Type attribute: skip it
                    # instead of crashing on ``None.split``.
                    continue
                rel_type = rel_type_uri.split('/')[-1]
                rels_counts[rel_type] = rels_counts.get(rel_type, 0) + 1

        # List every embedded media file, in archive order.
        media_files = [f for f in file_list if f.startswith('word/media/')]

    # Assemble the summary.
    info = {
        'source_docx': os.path.basename(docx_path),
        'elements': tag_counts,
        'relationships': rels_counts,
        'media_files': media_files,
    }

    # Write the JSON; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(info, f, ensure_ascii=False, indent=2)

    print(f"已生成 JSON 文件：{output_json}")
    return info


if __name__ == '__main__':
    # Script entry point: inspect test.docx in the current directory and
    # write its summary to docx_info.json.
    extract_docx_info(docx_path='./test.docx', output_json='docx_info.json')