103 lines
3.3 KiB
Python
103 lines
3.3 KiB
Python
|
|
"""
|
|||
|
|
本地 PDF 转 Markdown 微服务,兼容 langchain-chat 中 file_converter.pdf_to_html 的调用约定:
|
|||
|
|
|
|||
|
|
POST /convert/
|
|||
|
|
Body: {"pdf_path": "<相对于知识库根目录的路径>"}
|
|||
|
|
成功: {"status": "success", "markdown_path": "<生成的 .md 绝对路径>"}
|
|||
|
|
失败: {"status": "error", "message": "..."} + HTTP 4xx/5xx
|
|||
|
|
|
|||
|
|
环境变量:
|
|||
|
|
PDF_CONVERT_KB_ROOT 知识库根目录,须与 configs.kb_config.PDF_CONVERT_KB_ROOT 一致
|
|||
|
|
PDF_CONVERT_PORT 监听端口,默认 6006
|
|||
|
|
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
import fitz # PyMuPDF
|
|||
|
|
from fastapi import FastAPI, HTTPException
|
|||
|
|
from pydantic import BaseModel, Field
|
|||
|
|
import uvicorn
|
|||
|
|
|
|||
|
|
# Root logging is configured once at import time; this service logs under its own name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("pdf_convert_service")

# Keep the default in line with langchain-chat's default knowledge-base directory
# (this file lives at langchain-chat/pdf_convert_service/app.py).
_LC_ROOT = Path(__file__).resolve().parents[1]
_DEFAULT_KB = _LC_ROOT.joinpath("knowledge_base")
# PDF_CONVERT_KB_ROOT overrides the default; either way the result is resolved to an absolute path.
KB_ROOT = Path(os.getenv("PDF_CONVERT_KB_ROOT", str(_DEFAULT_KB))).resolve()

app = FastAPI(title="PDF Convert Service", version="1.0.0")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Request body accepted by the convert endpoints.
# A `#` comment is used on purpose: a class docstring would be published as the
# model description in the generated schema.
class ConvertRequest(BaseModel):
    # Path of the PDF relative to the knowledge-base root, e.g. kb_name/file.pdf.
    pdf_path: str = Field(
        default=...,
        description="相对于知识库根的路径,如 kb_name/file.pdf",
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _under_kb_root(full: Path) -> bool:
    """Tell whether *full* resolves to KB_ROOT itself or to a location beneath it."""
    try:
        # relative_to() raises ValueError when KB_ROOT is not a prefix of the resolved path.
        full.resolve().relative_to(KB_ROOT)
    except ValueError:
        inside = False
    else:
        inside = True
    return inside
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _pdf_to_markdown_file(pdf_abs: Path, md_abs: Path) -> None:
    """Extract the text of every page of a PDF and write it out as Markdown.

    Each page becomes a "## 第 N 页" section followed by that page's stripped
    text. When no page yields any text at all (including a document with zero
    pages), a single placeholder note is written instead of a list of empty
    page headings.

    Args:
        pdf_abs: Path of the PDF to read.
        md_abs: Path of the Markdown file to write; missing parent
            directories are created, and an existing file is overwritten.

    Raises:
        Any exception raised by PyMuPDF while opening/reading the document or
        by the filesystem while writing; the document is closed in all cases.
    """
    doc = fitz.open(pdf_abs)
    try:
        page_texts = [doc.load_page(i).get_text().strip() for i in range(len(doc))]
    finally:
        doc.close()

    if any(page_texts):
        # Keep a heading for every page (even empty ones) so section numbers
        # stay aligned with the PDF's page numbers.
        body = "".join(
            f"\n\n## 第 {number} 页\n\n{text}\n"
            for number, text in enumerate(page_texts, start=1)
        ).strip()
    else:
        # Previously the per-page headings alone made the body non-empty, so
        # this placeholder was never written for image-only PDFs.
        body = "_(未能从 PDF 提取到文本,可能是扫描件或加密文档)_"

    md_abs.parent.mkdir(parents=True, exist_ok=True)
    md_abs.write_text(body, encoding="utf-8")
|
|||
|
|
|
|||
|
|
|
|||
|
|
@app.get("/health")
|
|||
|
|
def health():
|
|||
|
|
return {"status": "ok", "kb_root": str(KB_ROOT)}
|
|||
|
|
|
|||
|
|
|
|||
|
|
@app.post("/convert/")
|
|||
|
|
@app.post("/convert")
|
|||
|
|
def convert(req: ConvertRequest):
|
|||
|
|
rel = req.pdf_path.replace("\\", "/").lstrip("/")
|
|||
|
|
if ".." in rel.split("/"):
|
|||
|
|
raise HTTPException(status_code=400, detail="invalid pdf_path")
|
|||
|
|
|
|||
|
|
pdf_abs = (KB_ROOT / rel).resolve()
|
|||
|
|
if not _under_kb_root(pdf_abs):
|
|||
|
|
raise HTTPException(status_code=403, detail="pdf_path outside knowledge base root")
|
|||
|
|
if not pdf_abs.is_file():
|
|||
|
|
raise HTTPException(status_code=404, detail=f"file not found: {pdf_abs}")
|
|||
|
|
if pdf_abs.suffix.lower() != ".pdf":
|
|||
|
|
raise HTTPException(status_code=400, detail="not a pdf file")
|
|||
|
|
|
|||
|
|
md_abs = pdf_abs.with_suffix(".md")
|
|||
|
|
try:
|
|||
|
|
_pdf_to_markdown_file(pdf_abs, md_abs)
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.exception("pdf convert failed")
|
|||
|
|
raise HTTPException(status_code=500, detail=str(e)) from e
|
|||
|
|
|
|||
|
|
return {"status": "success", "markdown_path": str(md_abs)}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Run the service under uvicorn, taking host and port from the environment."""
    # PDF_CONVERT_HOST defaults to loopback; PDF_CONVERT_PORT defaults to 6006.
    listen_host = os.environ.get("PDF_CONVERT_HOST", "127.0.0.1")
    listen_port = int(os.environ.get("PDF_CONVERT_PORT", "6006"))
    logger.info("KB_ROOT=%s listen=%s:%s", KB_ROOT, listen_host, listen_port)
    uvicorn.run(app, host=listen_host, port=listen_port, log_level="info")


if __name__ == "__main__":
    main()
|