Files
gangyan/langchain-chat/tests/docx_parser.py

56 lines
1.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import zipfile
from lxml import etree
import json
import os
def extract_docx_info(docx_path: str, output_json: str = 'docx_info.json') -> dict:
    """Summarise the structure of a DOCX file and write the summary as JSON.

    The DOCX is opened as a ZIP archive and three things are collected:

    * how often each element local name (namespace stripped) occurs in
      ``word/document.xml``;
    * how often each relationship type (last path segment of the ``Type``
      URI) occurs in ``word/_rels/document.xml.rels``, when that part exists;
    * the archive member names located under ``word/media/``.

    Args:
        docx_path: Path to the ``.docx`` file to inspect.
        output_json: Path of the JSON file to write (UTF-8, indented).

    Returns:
        The summary dictionary that was written to ``output_json``, with keys
        ``source_docx``, ``elements``, ``relationships`` and ``media_files``.

    Raises:
        FileNotFoundError: If ``docx_path`` is not an existing regular file.
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    if not os.path.isfile(docx_path):
        raise FileNotFoundError(f"未找到 DOCX 文件:{docx_path}")

    tag_counts = {}
    rels_counts = {}

    with zipfile.ZipFile(docx_path) as docx:
        file_list = docx.namelist()

        # Count element tags in document.xml. Restrict the walk to real
        # elements: an unfiltered iter() also yields comments and processing
        # instructions, for which etree.QName() raises.
        doc_tree = etree.fromstring(docx.read('word/document.xml'))
        for elem in doc_tree.iter(tag=etree.Element):
            tag = etree.QName(elem).localname
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

        # Count the document relationships (document.xml.rels), if present.
        rels_path = 'word/_rels/document.xml.rels'
        if rels_path in file_list:
            rels_tree = etree.fromstring(docx.read(rels_path))
            for rel in rels_tree.findall(
                './/{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
            ):
                type_uri = rel.get('Type')
                if type_uri is None:
                    # A Relationship without a Type attribute cannot be
                    # classified; skip it rather than crash on None.split().
                    continue
                rel_type = type_uri.split('/')[-1]
                rels_counts[rel_type] = rels_counts.get(rel_type, 0) + 1

    # List every embedded media file. Names ending in '/' are explicit ZIP
    # directory entries, not files, so they are left out.
    media_files = [
        name for name in file_list
        if name.startswith('word/media/') and not name.endswith('/')
    ]

    # Assemble the summary.
    info = {
        'source_docx': os.path.basename(docx_path),
        'elements': tag_counts,
        'relationships': rels_counts,
        'media_files': media_files,
    }

    # Write the JSON report; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(info, f, ensure_ascii=False, indent=2)
    print(f"已生成 JSON 文件:{output_json}")

    return info
if __name__ == '__main__':
    # Script entry point: analyse test.docx from the current working
    # directory and write the report to docx_info.json.
    extract_docx_info(docx_path='./test.docx', output_json='docx_info.json')