Files
gangyan/langchain-chat/pdf_convert_service/app.py

103 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
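Local PDF-to-Markdown microservice, compatible with the calling convention
used by file_converter.pdf_to_html in langchain-chat:

    POST /convert/
    Body:    {"pdf_path": "<path relative to the knowledge-base root>"}
    Success: {"status": "success", "markdown_path": "<absolute path of the generated .md>"}
    Failure: {"status": "error", "message": "..."} + HTTP 4xx/5xx

Environment variables:
    PDF_CONVERT_KB_ROOT  knowledge-base root directory; must match
                         configs.kb_config.PDF_CONVERT_KB_ROOT
    PDF_CONVERT_HOST     listening address, default 127.0.0.1
    PDF_CONVERT_PORT     listening port, default 6006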
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
import fitz # PyMuPDF
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import uvicorn
# Root logging is configured at INFO; this module logs under its own name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("pdf_convert_service")

# Default knowledge-base directory, matching langchain-chat's default layout
# (this file lives at langchain-chat/pdf_convert_service/app.py, so the
# directory above this file's own directory is the langchain-chat root).
_LC_ROOT = Path(__file__).resolve().parents[1]
_DEFAULT_KB = _LC_ROOT.joinpath("knowledge_base")

# PDF_CONVERT_KB_ROOT overrides the default; the value is resolved once here so
# that later containment checks compare against an absolute path.
KB_ROOT = Path(os.getenv("PDF_CONVERT_KB_ROOT", str(_DEFAULT_KB))).resolve()

app = FastAPI(version="1.0.0", title="PDF Convert Service")
class ConvertRequest(BaseModel):
    """Request body accepted by the convert endpoint."""

    # Required field: location of the PDF, expressed relative to the
    # knowledge-base root (for example ``kb_name/file.pdf``).
    pdf_path: str = Field(
        default=...,
        description="相对于知识库根的路径,如 kb_name/file.pdf",
    )
def _under_kb_root(full: Path) -> bool:
    """Report whether *full*, once resolved, is KB_ROOT itself or lies beneath it."""
    candidate = full.resolve()
    # Either the path is the root itself, or the root is one of its ancestors.
    return candidate == KB_ROOT or KB_ROOT in candidate.parents
def _pdf_to_markdown_file(pdf_abs: Path, md_abs: Path) -> None:
    """Extract the text of every page of a PDF and write it to a Markdown file.

    Each page becomes one section introduced by a ``## 第 N`` heading, where N
    is the 1-based page number. When no page yields any text, a single
    placeholder line is written instead of a list of empty sections.

    Args:
        pdf_abs: Path of the PDF to read.
        md_abs: Path of the Markdown file to write (UTF-8). Missing parent
            directories are created; an existing file is overwritten.

    Raises:
        Exception: Whatever PyMuPDF or the filesystem raises; nothing is
            caught here.
    """
    doc = fitz.open(pdf_abs)
    try:
        page_texts = [page.get_text().strip() for page in doc]
    finally:
        # Release the PDF handle before touching the output file.
        doc.close()

    # Decide on the placeholder from the extracted text itself: the section
    # headings alone would make the joined body non-empty for any document
    # that has at least one page, hiding the "no text" case.
    if any(page_texts):
        body = "".join(
            f"\n\n## 第 {page_no}\n\n{text}\n"
            for page_no, text in enumerate(page_texts, start=1)
        ).strip()
    else:
        body = "_未能从 PDF 提取到文本可能是扫描件或加密文档_"

    md_abs.parent.mkdir(parents=True, exist_ok=True)
    md_abs.write_text(body, encoding="utf-8")
@app.get("/health")
def health():
    """Liveness probe that also reports the knowledge-base root in use."""
    payload = {"status": "ok", "kb_root": str(KB_ROOT)}
    return payload
@app.exception_handler(HTTPException)
async def _http_error_as_json(_request, exc: HTTPException):
    """Render an HTTPException in the service's documented error format.

    The body carries ``status`` and ``message`` as promised by the module
    docstring, and keeps ``detail`` (FastAPI's default key) so clients that
    already read it continue to work. The HTTP status code is unchanged.
    """
    # Local import keeps this block independent of the file's import section.
    from fastapi.responses import JSONResponse

    return JSONResponse(
        status_code=exc.status_code,
        content={
            "status": "error",
            "message": str(exc.detail),
            "detail": exc.detail,
        },
        headers=getattr(exc, "headers", None),
    )


@app.post("/convert/")
@app.post("/convert")
def convert(req: ConvertRequest):
    """Convert a PDF inside the knowledge base to a sibling ``.md`` file.

    Args:
        req: Request body; ``pdf_path`` is taken relative to KB_ROOT.

    Returns:
        ``{"status": "success", "markdown_path": "<absolute .md path>"}``.

    Raises:
        HTTPException: 400 if the path contains a ``..`` segment or the file
            is not a ``.pdf``; 403 if the resolved path is outside KB_ROOT;
            404 if the file does not exist; 500 if the conversion fails.
    """
    # Normalise Windows separators and drop leading slashes so the value is
    # always joined beneath KB_ROOT rather than treated as an absolute path.
    rel = req.pdf_path.replace("\\", "/").lstrip("/")
    if ".." in rel.split("/"):
        raise HTTPException(status_code=400, detail="invalid pdf_path")

    # Resolve before the containment check so the final target is what gets tested.
    pdf_abs = (KB_ROOT / rel).resolve()
    if not _under_kb_root(pdf_abs):
        raise HTTPException(status_code=403, detail="pdf_path outside knowledge base root")
    if not pdf_abs.is_file():
        raise HTTPException(status_code=404, detail=f"file not found: {pdf_abs}")
    if pdf_abs.suffix.lower() != ".pdf":
        raise HTTPException(status_code=400, detail="not a pdf file")

    # The Markdown output is written next to the PDF, same stem.
    md_abs = pdf_abs.with_suffix(".md")
    try:
        _pdf_to_markdown_file(pdf_abs, md_abs)
    except Exception as e:
        # Endpoint boundary: log the traceback and report a 500 to the client.
        logger.exception("pdf convert failed")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"status": "success", "markdown_path": str(md_abs)}
def main():
    """Run the service under uvicorn using the host and port from the environment."""
    listen_host = os.getenv("PDF_CONVERT_HOST", "127.0.0.1")
    listen_port = int(os.getenv("PDF_CONVERT_PORT", "6006"))
    logger.info("KB_ROOT=%s listen=%s:%s", KB_ROOT, listen_host, listen_port)
    uvicorn.run(app, host=listen_host, port=listen_port, log_level="info")


if __name__ == "__main__":
    main()