[全量] 初始化项目代码、配置、文档及Agent协同harness

2026-04-02 11:36:05 +08:00
parent 0553309cdf
commit 87e571d9ec
1133 changed files with 221948 additions and 0 deletions
--- a/langchain-chat/tests/docx_parser.py
+++ b/langchain-chat/tests/docx_parser.py
@@ -0,0 +1,55 @@
+import zipfile
+from lxml import etree
+import json
+import os
+
+def extract_docx_info(docx_path: str, output_json: str = 'docx_info.json') -> dict:
+    """Collect structural statistics from a DOCX file and write them as JSON.
+
+    The DOCX is opened as a ZIP archive. Three things are gathered: how often
+    each element local name occurs in ``word/document.xml``, how often each
+    relationship type occurs in ``word/_rels/document.xml.rels`` (when that
+    part exists), and the archive members stored under ``word/media/``.
+
+    Args:
+        docx_path: Path to the DOCX file to inspect.
+        output_json: Path of the JSON report to write (overwritten if present).
+
+    Returns:
+        The report dictionary that was serialized, with the keys
+        ``source_docx``, ``elements``, ``relationships`` and ``media_files``.
+
+    Raises:
+        FileNotFoundError: If ``docx_path`` is not an existing regular file.
+        KeyError: If the archive has no ``word/document.xml`` member.
+    """
+    if not os.path.isfile(docx_path):
+        raise FileNotFoundError(f"未找到 DOCX 文件：{docx_path}")
+
+    tag_counts = {}
+    rels_counts = {}
+
+    # Entities are left unexpanded: a DOCX may come from an untrusted source
+    # and only element names and attributes are needed here.
+    parser = etree.XMLParser(resolve_entities=False)
+
+    with zipfile.ZipFile(docx_path) as docx:
+        file_list = docx.namelist()
+
+        # Count the element tags found in document.xml.
+        doc_tree = etree.fromstring(docx.read('word/document.xml'), parser)
+        for elem in doc_tree.iter():
+            # Comments, processing instructions and entity nodes expose a
+            # non-string ``tag``; etree.QName() rejects those, so skip them.
+            if not isinstance(elem.tag, str):
+                continue
+            tag = etree.QName(elem).localname
+            tag_counts[tag] = tag_counts.get(tag, 0) + 1
+
+        # Count the document relationships (document.xml.rels), if present.
+        rels_path = 'word/_rels/document.xml.rels'
+        if rels_path in file_list:
+            rels_tree = etree.fromstring(docx.read(rels_path), parser)
+            for rel in rels_tree.findall(
+                './/{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
+            ):
+                # A Relationship without a Type attribute would make
+                # ``.split`` fail on None; count it under 'unknown' instead.
+                type_uri = rel.get('Type')
+                if type_uri is None:
+                    rel_type = 'unknown'
+                else:
+                    # Keep only the last path segment of the type URI.
+                    rel_type = type_uri.split('/')[-1]
+                rels_counts[rel_type] = rels_counts.get(rel_type, 0) + 1
+
+        # List every embedded media file.
+        media_files = [f for f in file_list if f.startswith('word/media/')]
+
+    # Assemble the report.
+    info = {
+        'source_docx': os.path.basename(docx_path),
+        'elements': tag_counts,
+        'relationships': rels_counts,
+        'media_files': media_files,
+    }
+
+    # Write the JSON report; ensure_ascii=False keeps non-ASCII text readable.
+    with open(output_json, 'w', encoding='utf-8') as f:
+        json.dump(info, f, ensure_ascii=False, indent=2)
+
+    print(f"已生成 JSON 文件：{output_json}")
+
+    # Hand the report back so callers do not have to re-read the file.
+    return info
+
+
+if __name__ == '__main__':
+    # Script entry point: inspect test.docx in the current working directory
+    # and write the report next to it as docx_info.json.
+    extract_docx_info(
+        docx_path='./test.docx',
+        output_json='docx_info.json',
+    )