Files
gangyan/langchain-chat/document_loaders/myexcelloader.py

72 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import pandas as pd
from pathlib import Path
import logging
from typing import List, Optional, Union, Dict, Any
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class ExcelLoader(BaseLoader):
    """Load Excel workbooks (.xls/.xlsx) as documents.

    Every worksheet is parsed with pandas and turned into one ``Document``
    whose text is the sheet's non-empty cell values, row by row, joined
    with commas.
    """

    # File extensions accepted by the loader (compared case-insensitively).
    _SUPPORTED_SUFFIXES = (".xls", ".xlsx")

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """Validate the path and store the loader configuration.

        Args:
            file_path: Path to the Excel file; must end in .xls or .xlsx.
            metadata: Extra document-level metadata merged into the metadata
                of every document returned by ``load``.

        Raises:
            ValueError: If the file extension is neither .xls nor .xlsx.
            FileNotFoundError: If ``file_path`` is not an existing file.
        """
        self.file_path = str(file_path)
        self.metadata = metadata or {}

        suffix = Path(self.file_path).suffix.lower()
        if suffix not in self._SUPPORTED_SUFFIXES:
            raise ValueError(f"ExcelLoader 仅支持 .xls/.xlsx 文件: {self.file_path}")
        if not os.path.isfile(self.file_path):
            raise FileNotFoundError(f"文件不存在: {self.file_path}")

    @staticmethod
    def _cell_texts(df: pd.DataFrame) -> List[str]:
        """Return the stripped text of every non-empty cell, row by row."""
        texts: List[str] = []
        for row in df.itertuples(index=False, name=None):
            for cell in row:
                # Skip missing values (NaN/None/NaT) before stringifying.
                if pd.isna(cell):
                    continue
                text = str(cell).strip()
                # Whitespace-only cells count as empty as well.
                if text:
                    texts.append(text)
        return texts

    def load(self) -> List[Document]:
        """Read every worksheet and return one ``Document`` per sheet.

        Returns:
            One document per worksheet, in the order pandas returns the
            sheets. ``page_content`` is the sheet's non-empty cells joined
            with ","; ``metadata`` contains ``source`` and ``sheet_name``
            plus the loader's extra metadata (which wins on key collisions).

        Raises:
            RuntimeError: If pandas fails to read the file; the original
                exception is chained as the cause.
        """
        try:
            # sheet_name=None returns a dict: {sheet_name: DataFrame}.
            # header=None keeps the first row as data; the pandas default
            # (header=0) would consume it as column labels, so its cells
            # would never reach the output.
            sheets: Dict[str, pd.DataFrame] = pd.read_excel(
                self.file_path,
                sheet_name=None,
                header=None,
            )
        except Exception as e:
            logger.error("读取 Excel 文件失败: %s", self.file_path, exc_info=True)
            raise RuntimeError(f"无法加载 Excel 文件: {e}") from e

        documents: List[Document] = []
        for sheet_name, df in sheets.items():
            # Join all cell texts with an ASCII comma.
            content = ",".join(self._cell_texts(df))
            md: Dict[str, Any] = {
                "source": self.file_path,
                "sheet_name": sheet_name,
                **self.metadata,
            }
            documents.append(Document(page_content=content, metadata=md))
        return documents