""" 本地 PDF 转 Markdown 微服务,兼容 langchain-chat 中 file_converter.pdf_to_html 的调用约定: POST /convert/ Body: {"pdf_path": "<相对于知识库根目录的路径>"} 成功: {"status": "success", "markdown_path": "<生成的 .md 绝对路径>"} 失败: {"status": "error", "message": "..."} + HTTP 4xx/5xx 环境变量: PDF_CONVERT_KB_ROOT 知识库根目录,须与 configs.kb_config.PDF_CONVERT_KB_ROOT 一致 PDF_CONVERT_PORT 监听端口,默认 6006 """ from __future__ import annotations import logging import os from pathlib import Path import fitz # PyMuPDF from fastapi import FastAPI, HTTPException from pydantic import BaseModel, Field import uvicorn logger = logging.getLogger("pdf_convert_service") logging.basicConfig(level=logging.INFO) # 与 langchain-chat 默认知识库目录一致(本文件位于 langchain-chat/pdf_convert_service/app.py) _LC_ROOT = Path(__file__).resolve().parent.parent _DEFAULT_KB = _LC_ROOT / "knowledge_base" KB_ROOT = Path(os.environ.get("PDF_CONVERT_KB_ROOT", str(_DEFAULT_KB))).resolve() app = FastAPI(title="PDF Convert Service", version="1.0.0") class ConvertRequest(BaseModel): pdf_path: str = Field(..., description="相对于知识库根的路径,如 kb_name/file.pdf") def _under_kb_root(full: Path) -> bool: try: full.resolve().relative_to(KB_ROOT) return True except ValueError: return False def _pdf_to_markdown_file(pdf_abs: Path, md_abs: Path) -> None: doc = fitz.open(pdf_abs) try: chunks: list[str] = [] for i in range(len(doc)): page = doc.load_page(i) text = page.get_text() chunks.append(f"\n\n## 第 {i + 1} 页\n\n{text.strip()}\n") body = "".join(chunks).strip() if not body: body = "_(未能从 PDF 提取到文本,可能是扫描件或加密文档)_" md_abs.parent.mkdir(parents=True, exist_ok=True) md_abs.write_text(body, encoding="utf-8") finally: doc.close() @app.get("/health") def health(): return {"status": "ok", "kb_root": str(KB_ROOT)} @app.post("/convert/") @app.post("/convert") def convert(req: ConvertRequest): rel = req.pdf_path.replace("\\", "/").lstrip("/") if ".." in rel.split("/"): raise HTTPException(status_code=400, detail="invalid pdf_path") pdf_abs = (KB_ROOT / rel).resolve() if not _under_kb_root(pdf_abs): raise HTTPException(status_code=403, detail="pdf_path outside knowledge base root") if not pdf_abs.is_file(): raise HTTPException(status_code=404, detail=f"file not found: {pdf_abs}") if pdf_abs.suffix.lower() != ".pdf": raise HTTPException(status_code=400, detail="not a pdf file") md_abs = pdf_abs.with_suffix(".md") try: _pdf_to_markdown_file(pdf_abs, md_abs) except Exception as e: logger.exception("pdf convert failed") raise HTTPException(status_code=500, detail=str(e)) from e return {"status": "success", "markdown_path": str(md_abs)} def main(): port = int(os.environ.get("PDF_CONVERT_PORT", "6006")) host = os.environ.get("PDF_CONVERT_HOST", "127.0.0.1") logger.info("KB_ROOT=%s listen=%s:%s", KB_ROOT, host, port) uvicorn.run(app, host=host, port=port, log_level="info") if __name__ == "__main__": main()