import json
import os
import zipfile

from lxml import etree

# Archive member names and the namespace-qualified tag used by OPC ".rels" parts.
_DOCUMENT_PART = 'word/document.xml'
_DOCUMENT_RELS_PART = 'word/_rels/document.xml.rels'
_MEDIA_PREFIX = 'word/media/'
_RELATIONSHIP_TAG = (
    '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
)


def extract_docx_info(docx_path: str, output_json: str = 'docx_info.json') -> dict:
    """Summarise the structure of a DOCX file and write the summary as JSON.

    The summary contains:

    * ``source_docx``   - base name of the input file,
    * ``elements``      - count of every element local name (namespace
      stripped) found in ``word/document.xml``,
    * ``relationships`` - count of relationship types found in
      ``word/_rels/document.xml.rels`` (keyed by the last path segment of
      the ``Type`` URI); empty if that part is absent,
    * ``media_files``   - archive members stored under ``word/media/``.

    Args:
        docx_path: Path to the ``.docx`` file to inspect.
        output_json: Path of the JSON file to write (UTF-8, overwritten).

    Returns:
        The summary dictionary that was written to ``output_json``.

    Raises:
        FileNotFoundError: If ``docx_path`` is not an existing file.
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
        lxml.etree.XMLSyntaxError: If a parsed XML part is malformed.
    """
    if not os.path.isfile(docx_path):
        raise FileNotFoundError(f"未找到 DOCX 文件:{docx_path}")

    tag_counts = {}
    rels_counts = {}

    with zipfile.ZipFile(docx_path) as docx:
        file_list = docx.namelist()

        # Count element tags in the main document part.
        doc_tree = etree.fromstring(docx.read(_DOCUMENT_PART))
        # Restrict iteration to real elements: a bare iter() also yields
        # comments and processing instructions, whose .tag is not a string
        # and would make etree.QName() raise.
        for elem in doc_tree.iter(tag=etree.Element):
            tag = etree.QName(elem).localname
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

        # Count document relationships, if the .rels part exists.
        if _DOCUMENT_RELS_PART in file_list:
            rels_tree = etree.fromstring(docx.read(_DOCUMENT_RELS_PART))
            for rel in rels_tree.findall('.//' + _RELATIONSHIP_TAG):
                type_uri = rel.get('Type')
                if type_uri is None:
                    # Relationship without a Type attribute: nothing to
                    # classify, so skip it instead of crashing on None.
                    continue
                rel_type = type_uri.split('/')[-1]
                rels_counts[rel_type] = rels_counts.get(rel_type, 0) + 1

        # List every embedded media file.
        media_files = [f for f in file_list if f.startswith(_MEDIA_PREFIX)]

    # Assemble the summary.
    info = {
        'source_docx': os.path.basename(docx_path),
        'elements': tag_counts,
        'relationships': rels_counts,
        'media_files': media_files,
    }

    # Write the JSON report; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(info, f, ensure_ascii=False, indent=2)

    print(f"已生成 JSON 文件:{output_json}")
    return info


if __name__ == '__main__':
    # Process test.docx from the current directory.
    extract_docx_info('./test.docx', 'docx_info.json')