56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
|
|
import json
import os
import zipfile
from collections import Counter

from lxml import etree
|
|||
|
|
|
|||
|
|
def extract_docx_info(docx_path: str, output_json: str = 'docx_info.json'):
    """Summarize the structure of a DOCX package and save the summary as JSON.

    Counts the element local names found in ``word/document.xml``, counts the
    relationship types declared in ``word/_rels/document.xml.rels`` (when that
    part exists), and lists the archive members stored under ``word/media/``.

    Args:
        docx_path: Path to the ``.docx`` file to inspect.
        output_json: Path of the JSON report to write (UTF-8; overwritten if
            it already exists).

    Returns:
        The summary dict that was written to ``output_json``, with the keys
        ``source_docx``, ``elements``, ``relationships`` and ``media_files``.

    Raises:
        FileNotFoundError: If ``docx_path`` is not an existing regular file.
        zipfile.BadZipFile: If ``docx_path`` is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    if not os.path.isfile(docx_path):
        raise FileNotFoundError(f"未找到 DOCX 文件:{docx_path}")

    rels_path = 'word/_rels/document.xml.rels'
    rels_tag = (
        './/{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
    )
    rels_counts = Counter()

    with zipfile.ZipFile(docx_path) as docx:
        file_list = docx.namelist()

        # Count element tags in document.xml. Restricting the iteration to
        # real elements skips comments and processing instructions, whose
        # ``tag`` is not a string and would make ``etree.QName`` raise.
        doc_tree = etree.fromstring(docx.read('word/document.xml'))
        tag_counts = Counter(
            etree.QName(elem).localname
            for elem in doc_tree.iter(tag=etree.Element)
        )

        # Count document relationships by the last path segment of their
        # Type URI; the .rels part is optional, so only read it if present.
        if rels_path in file_list:
            rels_tree = etree.fromstring(docx.read(rels_path))
            for rel in rels_tree.findall(rels_tag):
                type_uri = rel.get('Type')
                # A malformed Relationship may lack the Type attribute; count
                # it under a placeholder instead of crashing on None.
                if type_uri is None:
                    rel_type = 'unknown'
                else:
                    rel_type = type_uri.split('/')[-1]
                rels_counts[rel_type] += 1

        # List embedded media files, ignoring explicit directory entries
        # (ZIP member names that end with "/").
        media_files = [
            name
            for name in file_list
            if name.startswith('word/media/') and not name.endswith('/')
        ]

    # Assemble the report; plain dicts keep the first-seen key order.
    info = {
        'source_docx': os.path.basename(docx_path),
        'elements': dict(tag_counts),
        'relationships': dict(rels_counts),
        'media_files': media_files,
    }

    # Write the JSON report.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(info, f, ensure_ascii=False, indent=2)

    print(f"已生成 JSON 文件:{output_json}")
    return info
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Inspect test.docx in the current directory and write the report
    # next to it as docx_info.json.
    extract_docx_info(docx_path='./test.docx', output_json='docx_info.json')
|