Files
gangyan/langchain-chat/pdf_convert_service/app.py

103 lines
3.3 KiB
Python
Raw Normal View History

"""
本地 PDF Markdown 微服务兼容 langchain-chat file_converter.pdf_to_html 的调用约定
POST /convert/
Body: {"pdf_path": "<相对于知识库根目录的路径>"}
成功: {"status": "success", "markdown_path": "<生成的 .md 绝对路径>"}
失败: {"status": "error", "message": "..."} + HTTP 4xx/5xx
环境变量:
PDF_CONVERT_KB_ROOT 知识库根目录须与 configs.kb_config.PDF_CONVERT_KB_ROOT 一致
PDF_CONVERT_PORT 监听端口默认 6006
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
import fitz # PyMuPDF
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import uvicorn
# Named logger for this service; basicConfig sets the root logger level to INFO.
logger = logging.getLogger("pdf_convert_service")
logging.basicConfig(level=logging.INFO)
# Default knowledge-base directory, intended to match langchain-chat's default
# (this file lives at langchain-chat/pdf_convert_service/app.py, so two levels
# up from this file is the langchain-chat root).
_LC_ROOT = Path(__file__).resolve().parent.parent
_DEFAULT_KB = _LC_ROOT / "knowledge_base"
# PDF_CONVERT_KB_ROOT overrides the default; the value is resolved once at import
# time and every request path is checked against it.
KB_ROOT = Path(os.environ.get("PDF_CONVERT_KB_ROOT", str(_DEFAULT_KB))).resolve()
app = FastAPI(title="PDF Convert Service", version="1.0.0")
# Request body accepted by the convert endpoint.
# (Documented with comments rather than a class docstring so the generated
# schema stays exactly as it was.)
class ConvertRequest(BaseModel):
    # Location of the PDF, expressed relative to the knowledge-base root.
    pdf_path: str = Field(
        ...,
        description="相对于知识库根的路径,如 kb_name/file.pdf",
    )
def _under_kb_root(full: Path) -> bool:
    """Tell whether *full*, after resolution, is KB_ROOT or a path beneath it.

    Args:
        full: Path to check; it is resolved before being compared.

    Returns:
        True when the resolved path can be expressed relative to KB_ROOT,
        False when a ValueError is raised while resolving or comparing.
    """
    try:
        # relative_to() raises ValueError when KB_ROOT is not a prefix of the path.
        full.resolve().relative_to(KB_ROOT)
    except ValueError:
        inside = False
    else:
        inside = True
    return inside
def _pdf_to_markdown_file(pdf_abs: Path, md_abs: Path) -> None:
    """Extract the text of every page of a PDF and write it out as Markdown.

    Each page becomes a second-level heading carrying its 1-based page number,
    followed by the page's stripped text. When no page yields any text, a
    single placeholder line is written instead of a list of empty headings.

    Args:
        pdf_abs: Path of the PDF to read.
        md_abs: Path of the Markdown file to write; missing parent
            directories are created and an existing file is overwritten.

    Raises:
        Exception: Whatever ``fitz`` or the filesystem raises is propagated
            unchanged to the caller.
    """
    doc = fitz.open(pdf_abs)
    try:
        page_texts = [
            doc.load_page(index).get_text().strip() for index in range(len(doc))
        ]
    finally:
        # Release the document as soon as the text has been pulled out.
        doc.close()

    # Decide on the extracted text alone: the per-page headings are never
    # empty, so testing the assembled body would not detect a text-less PDF.
    if any(page_texts):
        body = "".join(
            f"\n\n## 第 {page_no}\n\n{text}\n"
            for page_no, text in enumerate(page_texts, start=1)
        ).strip()
    else:
        body = "_未能从 PDF 提取到文本可能是扫描件或加密文档_"

    md_abs.parent.mkdir(parents=True, exist_ok=True)
    md_abs.write_text(body, encoding="utf-8")
@app.get("/health")
def health():
return {"status": "ok", "kb_root": str(KB_ROOT)}
@app.post("/convert/")
@app.post("/convert")
def convert(req: ConvertRequest):
rel = req.pdf_path.replace("\\", "/").lstrip("/")
if ".." in rel.split("/"):
raise HTTPException(status_code=400, detail="invalid pdf_path")
pdf_abs = (KB_ROOT / rel).resolve()
if not _under_kb_root(pdf_abs):
raise HTTPException(status_code=403, detail="pdf_path outside knowledge base root")
if not pdf_abs.is_file():
raise HTTPException(status_code=404, detail=f"file not found: {pdf_abs}")
if pdf_abs.suffix.lower() != ".pdf":
raise HTTPException(status_code=400, detail="not a pdf file")
md_abs = pdf_abs.with_suffix(".md")
try:
_pdf_to_markdown_file(pdf_abs, md_abs)
except Exception as e:
logger.exception("pdf convert failed")
raise HTTPException(status_code=500, detail=str(e)) from e
return {"status": "success", "markdown_path": str(md_abs)}
def main():
    """Start the service with uvicorn.

    The listen address comes from PDF_CONVERT_HOST (default ``127.0.0.1``)
    and the port from PDF_CONVERT_PORT (default ``6006``); both are logged
    together with KB_ROOT before the server starts.
    """
    listen_host = os.environ.get("PDF_CONVERT_HOST", "127.0.0.1")
    listen_port = int(os.environ.get("PDF_CONVERT_PORT", "6006"))
    logger.info("KB_ROOT=%s listen=%s:%s", KB_ROOT, listen_host, listen_port)
    uvicorn.run(app, host=listen_host, port=listen_port, log_level="info")


# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    main()