[前端+RAG] 修复PDF文字重叠;上传异步化(LLM+向量化后台执行);摘要关键词模型改为deepseek-v3
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
<template>
|
||||
<div class="pdf-viewer" ref="containerRef">
|
||||
<div v-for="page in pages" :key="page" class="pdf-page-wrapper">
|
||||
<div class="pdf-page" :id="'pdf-page-' + page" :style="{ position: 'relative' }">
|
||||
<div class="pdf-page" :style="{ width: pageWidths[page] + 'px', height: pageHeights[page] + 'px' }">
|
||||
<canvas :ref="el => setCanvasRef(el, page)"></canvas>
|
||||
<div class="text-layer" :ref="el => setTextLayerRef(el, page)"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div v-if="loading" class="pdf-loading">加载中...</div>
|
||||
<div v-if="loading" class="pdf-loading">PDF 加载中...</div>
|
||||
<div v-if="error" class="pdf-error">{{ error }}</div>
|
||||
</div>
|
||||
</template>
|
||||
@@ -14,8 +14,8 @@
|
||||
<script setup lang="ts">
|
||||
import { ref, onMounted, onBeforeUnmount, nextTick, watch } from 'vue';
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import { renderTextLayer } from 'pdfjs-dist';
|
||||
|
||||
// Set worker
|
||||
pdfjsLib.GlobalWorkerOptions.workerSrc = new URL(
|
||||
'pdfjs-dist/build/pdf.worker.min.js',
|
||||
import.meta.url
|
||||
@@ -28,18 +28,16 @@ const props = defineProps<{
|
||||
|
||||
const containerRef = ref<HTMLElement | null>(null);
|
||||
const pages = ref<number[]>([]);
|
||||
const pageWidths = ref<Record<number, number>>({});
|
||||
const pageHeights = ref<Record<number, number>>({});
|
||||
const loading = ref(true);
|
||||
const error = ref('');
|
||||
const canvasRefs: Record<number, HTMLCanvasElement> = {};
|
||||
const textLayerRefs: Record<number, HTMLElement> = {};
|
||||
let pdfDoc: any = null;
|
||||
|
||||
const setCanvasRef = (el: any, page: number) => {
|
||||
if (el) canvasRefs[page] = el;
|
||||
};
|
||||
const setTextLayerRef = (el: any, page: number) => {
|
||||
if (el) textLayerRefs[page] = el;
|
||||
};
|
||||
const setCanvasRef = (el: any, page: number) => { if (el) canvasRefs[page] = el; };
|
||||
const setTextLayerRef = (el: any, page: number) => { if (el) textLayerRefs[page] = el; };
|
||||
|
||||
const renderPage = async (pageNum: number) => {
|
||||
if (!pdfDoc) return;
|
||||
@@ -47,43 +45,28 @@ const renderPage = async (pageNum: number) => {
|
||||
const scale = props.scale || 1.5;
|
||||
const viewport = page.getViewport({ scale });
|
||||
|
||||
// Canvas rendering
|
||||
const canvas = canvasRefs[pageNum];
|
||||
if (!canvas) return;
|
||||
const context = canvas.getContext('2d');
|
||||
canvas.height = viewport.height;
|
||||
canvas.width = viewport.width;
|
||||
pageWidths.value[pageNum] = viewport.width;
|
||||
pageHeights.value[pageNum] = viewport.height;
|
||||
|
||||
await page.render({ canvasContext: context, viewport }).promise;
|
||||
|
||||
// Text layer for text selection
|
||||
// Text layer using pdfjs built-in API
|
||||
const textLayerDiv = textLayerRefs[pageNum];
|
||||
if (textLayerDiv) {
|
||||
textLayerDiv.style.width = viewport.width + 'px';
|
||||
textLayerDiv.style.height = viewport.height + 'px';
|
||||
textLayerDiv.innerHTML = '';
|
||||
|
||||
const textContent = await page.getTextContent();
|
||||
const textItems = textContent.items;
|
||||
|
||||
for (const item of textItems) {
|
||||
if (!item.str) continue;
|
||||
const tx = pdfjsLib.Util.transform(viewport.transform, item.transform);
|
||||
const span = document.createElement('span');
|
||||
span.textContent = item.str;
|
||||
span.style.position = 'absolute';
|
||||
span.style.left = tx[4] + 'px';
|
||||
span.style.top = (viewport.height - tx[5]) + 'px';
|
||||
span.style.fontSize = Math.abs(tx[0]) + 'px';
|
||||
span.style.fontFamily = item.fontName || 'sans-serif';
|
||||
span.style.transformOrigin = '0% 0%';
|
||||
// Width matching
|
||||
if (item.width) {
|
||||
const textWidth = item.width * scale;
|
||||
span.style.width = textWidth + 'px';
|
||||
span.style.display = 'inline-block';
|
||||
}
|
||||
textLayerDiv.appendChild(span);
|
||||
}
|
||||
renderTextLayer({
|
||||
textContentSource: textContent,
|
||||
container: textLayerDiv,
|
||||
viewport: viewport,
|
||||
textDivs: []
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -97,33 +80,20 @@ const loadPdf = async () => {
|
||||
pdfDoc = await loadingTask.promise;
|
||||
const numPages = pdfDoc.numPages;
|
||||
pages.value = Array.from({ length: numPages }, (_, i) => i + 1);
|
||||
|
||||
await nextTick();
|
||||
for (let i = 1; i <= numPages; i++) {
|
||||
await renderPage(i);
|
||||
}
|
||||
} catch (e: any) {
|
||||
error.value = 'PDF 加载失败: ' + (e.message || e);
|
||||
console.error('PDF load error:', e);
|
||||
} finally {
|
||||
loading.value = false;
|
||||
}
|
||||
};
|
||||
|
||||
watch(() => props.src, () => {
|
||||
if (props.src) loadPdf();
|
||||
});
|
||||
|
||||
onMounted(() => {
|
||||
if (props.src) loadPdf();
|
||||
});
|
||||
|
||||
onBeforeUnmount(() => {
|
||||
if (pdfDoc) {
|
||||
pdfDoc.destroy();
|
||||
pdfDoc = null;
|
||||
}
|
||||
});
|
||||
watch(() => props.src, () => { if (props.src) loadPdf(); });
|
||||
onMounted(() => { if (props.src) loadPdf(); });
|
||||
onBeforeUnmount(() => { if (pdfDoc) { pdfDoc.destroy(); pdfDoc = null; } });
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
@@ -131,54 +101,51 @@ onBeforeUnmount(() => {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
overflow-y: auto;
|
||||
background: #f5f5f5;
|
||||
background: #e8e8e8;
|
||||
padding: 16px 0;
|
||||
}
|
||||
|
||||
.pdf-page-wrapper {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
margin-bottom: 16px;
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
|
||||
.pdf-page {
|
||||
position: relative;
|
||||
background: white;
|
||||
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
.pdf-page canvas {
|
||||
display: block;
|
||||
}
|
||||
|
||||
.text-layer {
|
||||
position: absolute;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
bottom: 0;
|
||||
overflow: hidden;
|
||||
opacity: 0.3;
|
||||
line-height: 1;
|
||||
}
|
||||
|
||||
.text-layer span {
|
||||
/* pdfjs renderTextLayer creates spans with absolute positioning */
|
||||
.text-layer :deep(span) {
|
||||
color: transparent;
|
||||
position: absolute;
|
||||
white-space: pre;
|
||||
cursor: text;
|
||||
transform-origin: 0% 0%;
|
||||
}
|
||||
|
||||
.text-layer span::selection {
|
||||
.text-layer :deep(span::selection) {
|
||||
background: rgba(0, 78, 160, 0.3);
|
||||
color: transparent;
|
||||
}
|
||||
|
||||
.text-layer :deep(br) {
|
||||
display: none;
|
||||
}
|
||||
.pdf-loading, .pdf-error {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
color: #999;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.pdf-error {
|
||||
color: #c00;
|
||||
}
|
||||
.pdf-error { color: #c00; }
|
||||
</style>
|
||||
|
||||
@@ -269,6 +269,62 @@ def upload_docs(
|
||||
return BaseResponse(code=200, msg="文件上传与向量化完成", data={"failed_files": failed_files})
|
||||
|
||||
|
||||
def _background_llm_and_vectorize(
        knowledge_base_name: str,
        file_names: List[str],
        chunk_size: int,
        chunk_overlap: int,
        zh_title_enhance: bool,
        docs: dict,
        not_refresh_vs_cache: bool,
):
    """Background worker: generate the LLM reading guide, then vectorize.

    Intended to run in its own thread so the upload request can return
    before the slow LLM / embedding work finishes.

    Step 1: for every file, ``KnowledgeFile.get_llm_result()`` is awaited and
    its result is dumped to ``<cache_dir>/<filename>.llm_result.json``
    (the original comment states a Java backend polls for this file).
    Step 2: ``_update_docs_impl`` is called once for all files, and the
    vector store is saved afterwards unless ``not_refresh_vs_cache`` is set.

    Failures are logged and never raised: a failed guide for one file does
    not stop the remaining files or the vectorization step.

    Args:
        knowledge_base_name: Name of the target knowledge base.
        file_names: File names to process.
        chunk_size: Forwarded unchanged to ``_update_docs_impl``.
        chunk_overlap: Forwarded unchanged to ``_update_docs_impl``.
        zh_title_enhance: Forwarded unchanged to ``_update_docs_impl``.
        docs: Forwarded unchanged to ``_update_docs_impl``.
        not_refresh_vs_cache: When true, the vector store is not saved here.

    Returns:
        None. Outcomes are reported through the logger and the cache files.
    """
    import json
    import time

    start_time = time.time()

    kb = KBServiceFactory.get_service_by_name(knowledge_base_name)

    # Loop-invariant: <three levels above this module>/knowledge_base/<kb name>.
    cache_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
        "knowledge_base",
        knowledge_base_name,
    )

    # 1. Generate the LLM reading guide (abstract, keywords, section overview).
    for filename in file_names:
        try:
            knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name)

            # asyncio.run() creates a fresh loop for this thread, closes it and
            # unregisters it afterwards, so no closed loop is left installed as
            # the thread's current event loop.
            llm_result = asyncio.run(knowledge_file.get_llm_result())

            cache_file = os.path.join(cache_dir, f"{filename}.llm_result.json")
            # Create the parent of the actual target so that a file name that
            # contains a sub-directory does not fail on open().
            os.makedirs(os.path.dirname(cache_file), exist_ok=True)

            # Write to a temp file and rename it into place: a concurrent
            # reader polling for cache_file never sees a half-written JSON.
            tmp_file = f"{cache_file}.tmp"
            try:
                with open(tmp_file, 'w', encoding='utf-8') as f:
                    json.dump(llm_result, f, ensure_ascii=False)
                os.replace(tmp_file, cache_file)
            finally:
                # Only still present if the dump or the rename failed.
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)

            logger.info(f"[后台] LLM 导读生成完成: {filename}")
        except Exception as e:
            # NOTE(review): no cache file is written on failure, so a poller
            # cannot distinguish "failed" from "still running" - confirm how
            # the consumer is expected to time out.
            logger.error(f"[后台] LLM 导读生成失败 {filename}: {e}", exc_info=True)

    # 2. Vectorize.
    try:
        # Always defer the save here; it is done once below for the whole batch.
        result = _update_docs_impl(
            knowledge_base_name=knowledge_base_name,
            file_names=file_names,
            override_custom_docs=True,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            zh_title_enhance=zh_title_enhance,
            docs=docs,
            not_refresh_vs_cache=True,
        )

        # Nobody is waiting on this thread's return value, so per-file
        # failures would otherwise be lost; surface them in the log.
        result_data = getattr(result, "data", None)
        failed_files = result_data.get("failed_files") if isinstance(result_data, dict) else None
        if failed_files:
            logger.warning(f"[后台] 向量化失败文件: {failed_files}")

        if kb and not not_refresh_vs_cache:
            kb.save_vector_store()
        logger.info(f"[后台] 向量化完成,总耗时: {time.time() - start_time:.2f}s")
    except Exception as e:
        logger.error(f"[后台] 向量化失败: {e}", exc_info=True)
|
||||
|
||||
|
||||
def upload_docs_new(
|
||||
files: List[UploadFile] = File(..., description="上传文件,支持多文件"),
|
||||
knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]),
|
||||
@@ -282,16 +338,15 @@ def upload_docs_new(
|
||||
not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库(用于FAISS)"),
|
||||
) -> BaseResponse:
|
||||
"""
|
||||
API接口:上传文件,并/或向量化
|
||||
API接口:上传文件,先提取全文快速返回,LLM导读+向量化后台异步执行
|
||||
"""
|
||||
import time # 添加计时模块
|
||||
import time
|
||||
start_time = time.time()
|
||||
if not validate_kb_name(knowledge_base_name):
|
||||
return BaseResponse(code=403, msg="Don't attack me")
|
||||
|
||||
kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
|
||||
if kb is None:
|
||||
# 自动创建知识库
|
||||
kb = KBServiceFactory.get_service(knowledge_base_name, DEFAULT_VS_TYPE, EMBEDDING_MODEL)
|
||||
try:
|
||||
kb.create_kb()
|
||||
@@ -303,68 +358,52 @@ def upload_docs_new(
|
||||
|
||||
failed_files = {}
|
||||
file_names = list(docs.keys())
|
||||
|
||||
# 生成摘要、关键词、章节速览的结果存储
|
||||
llm_results = {}
|
||||
|
||||
# 先将上传的文件保存到磁盘
|
||||
# 保存文件到磁盘 + 提取全文(快速操作)
|
||||
for result in _save_files_in_thread(files, knowledge_base_name=knowledge_base_name, override=override):
|
||||
filename = result["data"]["file_name"]
|
||||
if result["code"] != 200:
|
||||
failed_files[filename] = result["msg"]
|
||||
|
||||
if filename not in file_names:
|
||||
file_names.append(filename)
|
||||
|
||||
# 针对成功上传的文件,生成摘要、关键词、章节速览
|
||||
# 仅提取全文(快速),不调用 LLM
|
||||
try:
|
||||
knowledge_file = KnowledgeFile(filename=filename, knowledge_base_name=knowledge_base_name)
|
||||
# 使用线程池运行异步函数,避免事件循环冲突
|
||||
import concurrent.futures
|
||||
def run_async_in_thread():
|
||||
new_loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(new_loop)
|
||||
full_text_data = knowledge_file.get_full_text()
|
||||
import json as _json
|
||||
try:
|
||||
return new_loop.run_until_complete(knowledge_file.get_llm_result())
|
||||
finally:
|
||||
new_loop.close()
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
||||
future = executor.submit(run_async_in_thread)
|
||||
llm_result = future.result()
|
||||
full_text = _json.loads(full_text_data).get("full_text", "")
|
||||
except:
|
||||
full_text = ""
|
||||
llm_results[filename] = {
|
||||
"full_text": llm_result.get("full_text", "获取全文失败"),
|
||||
"article_abstract": llm_result.get("article_abstract", "生成摘要失败"),
|
||||
"article_keywords": llm_result.get("article_keywords", "生成关键词失败"),
|
||||
"article_paragraph": llm_result.get("article_paragraph", "生成章节速览失败")
|
||||
"full_text": full_text,
|
||||
"article_abstract": "导读生成中...",
|
||||
"article_keywords": "导读生成中...",
|
||||
"article_paragraph": "导读生成中..."
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"生成LLM结果时出错:{e}", exc_info=e if log_verbose else None)
|
||||
logger.error(f"提取全文失败 {filename}: {e}")
|
||||
llm_results[filename] = {
|
||||
"article_abstract": "生成摘要失败",
|
||||
"article_keywords": "生成关键词失败",
|
||||
"article_paragraph": "生成章节速览失败"
|
||||
"full_text": "",
|
||||
"article_abstract": "导读生成中...",
|
||||
"article_keywords": "导读生成中...",
|
||||
"article_paragraph": "导读生成中..."
|
||||
}
|
||||
|
||||
# 对保存的文件进行向量化
|
||||
if to_vector_store:
|
||||
update_st = time.time()
|
||||
result = _update_docs_impl(
|
||||
knowledge_base_name=knowledge_base_name,
|
||||
file_names=file_names,
|
||||
override_custom_docs=True,
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
zh_title_enhance=zh_title_enhance,
|
||||
docs=docs,
|
||||
not_refresh_vs_cache=True,
|
||||
# 后台异步执行 LLM 导读 + 向量化(不阻塞响应)
|
||||
import threading
|
||||
bg_thread = threading.Thread(
|
||||
target=_background_llm_and_vectorize,
|
||||
args=(knowledge_base_name, file_names, chunk_size, chunk_overlap,
|
||||
zh_title_enhance, docs, not_refresh_vs_cache),
|
||||
daemon=True
|
||||
)
|
||||
failed_files.update(result.data["failed_files"])
|
||||
if not not_refresh_vs_cache:
|
||||
kb.save_vector_store()
|
||||
logger.info(f'向量化用时:{time.time() - update_st}')
|
||||
logger.info(f"总执行时间: {time.time() - start_time:.2f}s")
|
||||
return BaseResponse(code=200, msg="文件上传与向量化完成", data={
|
||||
bg_thread.start()
|
||||
|
||||
logger.info(f"文件上传+全文提取用时: {time.time() - start_time:.2f}s,LLM+向量化已转后台")
|
||||
return BaseResponse(code=200, msg="文件上传完成,导读生成中", data={
|
||||
"failed_files": failed_files,
|
||||
"llm_results": llm_results
|
||||
})
|
||||
|
||||
@@ -390,7 +390,7 @@ class KnowledgeFile:
|
||||
llm_time = time.time()
|
||||
abstract_task = get_llm_model_response_async(
|
||||
strategy_name="gen_abstract",
|
||||
llm_model_name=LLM_MODELS[1],
|
||||
llm_model_name=LLM_MODELS[0],
|
||||
template_prompt_name="gen_abstract",
|
||||
prompt_param_dict={"context": full_text},
|
||||
temperature=0.7,
|
||||
@@ -399,7 +399,7 @@ class KnowledgeFile:
|
||||
|
||||
keywords_task = get_llm_model_response_async(
|
||||
strategy_name="gen_keywords",
|
||||
llm_model_name=LLM_MODELS[1],
|
||||
llm_model_name=LLM_MODELS[0],
|
||||
template_prompt_name="gen_keywords",
|
||||
prompt_param_dict={"context": full_text},
|
||||
temperature=0.7,
|
||||
|
||||
Reference in New Issue
Block a user