import asyncio
import time
from langchain.chains import LLMChain
from langchain_core.prompts import ChatPromptTemplate
from configs import KB_PROMPT, LLM_PROMPT, logger
from configs.prompt_config import AGENT_PROMPT, AGENT_WRITE_PROMPT, COMPARISON
from server.chat.utils import History
from server.utils import get_prompt_template, get_strategy_prompt_template, get_ChatOpenAI
import openai
from typing import Any, AsyncGenerator
from langchain.schema import HumanMessage
# Number of retries after the first failed LLM call (so up to MAX_RETRIES + 1 attempts).
MAX_RETRIES = 2
# Seconds to wait between two attempts.
RETRY_DELAY = 1
MAX_MAX_TOKENS = 8192 # Alibaba Cloud DashScope API limit
def _resolve_prompt_template(template_prompt_name: str):
    """Look up the prompt text for *template_prompt_name* in the configured prompt groups."""
    if template_prompt_name in KB_PROMPT:
        return get_prompt_template("knowledge_base_chat", template_prompt_name)
    if template_prompt_name in LLM_PROMPT:
        return get_prompt_template("llm_chat", template_prompt_name)
    if template_prompt_name in COMPARISON:
        return get_prompt_template("comparison_chat", template_prompt_name)

    # This only applies to the template prompts of the new agent flows; when
    # adding other groups, check that they do not conflict with these.
    # Each entry: (prompt group, shared preamble template, names used WITHOUT the preamble).
    agent_groups = (
        (AGENT_PROMPT, "Think Test Bak", ("Think Test Bak", "get_next_tip")),
        (AGENT_WRITE_PROMPT, "Write Test Bak", ("Write Test Bak", "get_next_write_tip")),
    )
    for group, preamble_name, standalone_names in agent_groups:
        if template_prompt_name not in group:
            continue
        if template_prompt_name in standalone_names:
            return get_prompt_template("agent_chat", template_prompt_name)
        preamble = get_prompt_template("agent_chat", preamble_name)
        body = get_prompt_template("agent_chat", template_prompt_name)
        return f"{preamble}{body}"

    # Not in any known group: fall back to the strategy prompt templates.
    return get_strategy_prompt_template("knowledge_base_chat", template_prompt_name)


def get_llm_model_response(
        strategy_name: str,
        llm_model_name: str,
        template_prompt_name: str,
        prompt_param_dict: dict,
        temperature: float,
        max_tokens: int,
        **kwargs: Any,
) -> str:
    """Call the LLM synchronously with the prompt selected by *template_prompt_name*.

    Args:
        strategy_name: Label of the calling strategy. Not used by the call
            itself; kept so existing callers keep working.
        llm_model_name: Model name passed to ``get_ChatOpenAI``.
        template_prompt_name: Name of the prompt template; the prompt group it
            belongs to decides which template (and optional preamble) is used.
        prompt_param_dict: Variables passed to ``LLMChain.run`` to fill the prompt.
        temperature: Sampling temperature passed to ``get_ChatOpenAI``.
        max_tokens: Requested token limit; values above ``MAX_MAX_TOKENS`` are
            clamped (with a warning). ``None`` is passed through unchanged.
        **kwargs: Extra keyword arguments forwarded to ``get_ChatOpenAI``.

    Returns:
        The value returned by ``LLMChain.run``.

    Raises:
        Exception: The last error raised by the chain once ``MAX_RETRIES``
            retries have been used up.
    """
    # Make sure max_tokens does not exceed the API limit.
    if max_tokens is not None and max_tokens > MAX_MAX_TOKENS:
        logger.warning(f"max_tokens({max_tokens}) 超过 API 限制,已调整为 {MAX_MAX_TOKENS}")
        max_tokens = MAX_MAX_TOKENS

    # Load the requested model. Callbacks must NOT be attached here, otherwise
    # this intermediate model response would be added to the final answer.
    # The first attempt uses whatever streaming setting get_ChatOpenAI / kwargs provide.
    model = get_ChatOpenAI(
        model_name=llm_model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        callbacks=[],
        **kwargs,
    )

    prompt_template = _resolve_prompt_template(template_prompt_name)
    input_msg = History(role="system", content=prompt_template).to_msg_template(False)
    prompt = ChatPromptTemplate.from_messages([input_msg])

    # Retries force streaming off. Merging into one dict (instead of passing
    # `streaming=False, **kwargs`) avoids a duplicate-keyword TypeError when the
    # caller already supplied `streaming`, which would mask the real LLM error.
    retry_kwargs = {**kwargs, "streaming": False}

    retry_count = 0
    while True:
        try:
            llm_chain = LLMChain(prompt=prompt, llm=model, verbose=True)
            return llm_chain.run(prompt_param_dict)
        except Exception as e:
            retry_count += 1
            if retry_count > MAX_RETRIES:
                logger.error(f"LLM调用失败,已达到最大重试次数 {MAX_RETRIES}: {e}")
                raise
            logger.warning(f"LLM调用第 {retry_count} 次失败,{RETRY_DELAY}秒后重试: {e}")
            time.sleep(RETRY_DELAY)
            # Re-create the model for the next attempt, with streaming disabled.
            model = get_ChatOpenAI(
                model_name=llm_model_name,
                temperature=temperature,
                max_tokens=max_tokens,
                callbacks=[],
                **retry_kwargs,
            )
async def get_llm_model_response_async(
        strategy_name: str,
        llm_model_name: str,
        template_prompt_name: str,
        prompt_param_dict: dict,
        temperature: float,
        max_tokens: int,
        **kwargs: Any,
) -> str:
    """Run ``get_llm_model_response`` in the loop's default executor and await its result.

    The synchronous call (including its retry sleeps) runs off the event loop
    thread, so the loop is not blocked while waiting for the model.

    Args:
        strategy_name: Forwarded unchanged to ``get_llm_model_response``.
        llm_model_name: Forwarded unchanged to ``get_llm_model_response``.
        template_prompt_name: Forwarded unchanged to ``get_llm_model_response``.
        prompt_param_dict: Forwarded unchanged to ``get_llm_model_response``.
        temperature: Forwarded unchanged to ``get_llm_model_response``.
        max_tokens: Forwarded unchanged to ``get_llm_model_response``.
        **kwargs: Optional extra keyword arguments, forwarded to
            ``get_llm_model_response`` (and from there to the model factory).

    Returns:
        Whatever ``get_llm_model_response`` returns.

    Raises:
        Exception: Any exception raised by ``get_llm_model_response``.
    """
    import functools

    # Inside a coroutine, get_running_loop() is the preferred way to obtain the
    # loop; get_event_loop() has policy-dependent behaviour.
    loop = asyncio.get_running_loop()
    # run_in_executor() only forwards positional arguments, so keyword
    # arguments are bound with functools.partial.
    call = functools.partial(
        get_llm_model_response,
        strategy_name,
        llm_model_name,
        template_prompt_name,
        prompt_param_dict,
        temperature,
        max_tokens,
        **kwargs,
    )
    return await loop.run_in_executor(None, call)
_THINK_OPEN_TAG = "<think>"
_THINK_CLOSE_TAG = "</think>"


def _partial_tag_suffix_len(text: str, tag: str) -> int:
    """Return the length of the longest suffix of *text* that is a proper prefix of *tag*."""
    for size in range(min(len(text), len(tag) - 1), 0, -1):
        if text.endswith(tag[:size]):
            return size
    return 0


class _ThinkBlockFilter:
    """Incrementally remove ``<think>...</think>`` blocks from a stream of text chunks."""

    def __init__(self) -> None:
        self._in_think = False
        # Tail of the previous chunk that might be the beginning of a tag.
        self._pending = ""

    def feed(self, chunk: str) -> str:
        """Return the part of *chunk* (plus any held-back tail) that lies outside think blocks."""
        text = self._pending + chunk
        self._pending = ""
        visible = []
        while text:
            tag = _THINK_CLOSE_TAG if self._in_think else _THINK_OPEN_TAG
            idx = text.find(tag)
            if idx >= 0:
                if not self._in_think:
                    visible.append(text[:idx])
                text = text[idx + len(tag):]
                self._in_think = not self._in_think
                continue
            # No complete tag: hold back a trailing partial tag so that a tag
            # split across two chunks is still recognised on the next feed().
            keep = _partial_tag_suffix_len(text, tag)
            cut = len(text) - keep
            if not self._in_think:
                visible.append(text[:cut])
            self._pending = text[cut:]
            break
        return "".join(visible)

    def flush(self) -> str:
        """Return text still held back at end of stream (empty while inside a think block)."""
        tail, self._pending = self._pending, ""
        return "" if self._in_think else tail


async def get_llm_model_response_stream_openai(
        type: int,
        strategy_name: str,
        llm_model_name: str,
        template_prompt_name: str,
        prompt_param_dict: dict,
        temperature: float,
        max_tokens: int,
) -> AsyncGenerator[str, None]:
    """Stream the model's answer, with ``<think>...</think>`` blocks removed.

    Args:
        type: Selects prompt layout and model options (the name shadows the
            builtin but is part of the public signature):
            ``0`` - "Think Test Bak" preamble + ``agent_chat`` template, with
            ``enable_thinking`` passed via ``extra_body``;
            ``1`` - "Write Test Bak" preamble + ``agent_chat`` template;
            ``2`` - ``llm_chat`` template only, with ``enable_thinking`` passed
            via ``extra_body``.
        strategy_name: Not used by this function; kept for caller compatibility.
        llm_model_name: Model name passed to ``get_ChatOpenAI``.
        template_prompt_name: Name of the prompt template to load.
        prompt_param_dict: Mapping whose keys replace ``{{key}}`` placeholders
            in the prompt text; values are converted with ``str()``.
        temperature: Sampling temperature passed to ``get_ChatOpenAI``.
        max_tokens: Requested token limit; values above ``MAX_MAX_TOKENS`` are
            clamped (with a warning). ``None`` is passed through unchanged.

    Yields:
        Non-empty pieces of the answer text lying outside think blocks.

    Raises:
        ValueError: If *type* is not 0, 1 or 2.
        Exception: The last streaming error once ``MAX_RETRIES`` retries are
            used up, or immediately if the failure happens after some text has
            already been yielded.
    """
    if type not in (0, 1, 2):
        # Previously an unsupported value surfaced as an UnboundLocalError after
        # several pointless retries.
        raise ValueError(f"unsupported type: {type!r} (expected 0, 1 or 2)")

    # Make sure max_tokens does not exceed the API limit.
    if max_tokens is not None and max_tokens > MAX_MAX_TOKENS:
        logger.warning(f"max_tokens({max_tokens}) 超过 API 限制,已调整为 {MAX_MAX_TOKENS}")
        max_tokens = MAX_MAX_TOKENS

    # Build the prompt once; it does not change between retries.
    if type == 2:
        prompt_template = get_prompt_template("llm_chat", template_prompt_name)
    else:
        preamble_name = "Think Test Bak" if type == 0 else "Write Test Bak"
        preamble = get_prompt_template("agent_chat", preamble_name)
        body = get_prompt_template("agent_chat", template_prompt_name)
        prompt_template = f"{preamble}{body}"
    for key, value in prompt_param_dict.items():
        # The f-string renders the literal placeholder "{{key}}".
        prompt_template = prompt_template.replace(f"{{{{{key}}}}}", str(value))
    messages = [HumanMessage(content=prompt_template)]

    model_kwargs = {}
    if type in (0, 2):
        model_kwargs["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}

    retry_count = 0
    emitted = False  # True once any text has been handed to the consumer
    while True:
        try:
            model = get_ChatOpenAI(
                model_name=llm_model_name,
                temperature=temperature,
                max_tokens=max_tokens,
                callbacks=[],
                **model_kwargs,
            )
            # Skip <think>...</think> blocks and yield everything else. This
            # supports models that emit think blocks (e.g. R1); other models
            # are unaffected.
            think_filter = _ThinkBlockFilter()
            async for chunk in model.astream(messages):
                visible = think_filter.feed(chunk.content or "")
                if visible:
                    emitted = True
                    yield visible
            tail = think_filter.flush()
            if tail:
                emitted = True
                yield tail
            return  # finished successfully
        except Exception as e:
            if emitted:
                # Restarting the stream now would send the beginning of the
                # answer to the consumer a second time.
                logger.error(f"流式LLM调用在已输出部分内容后失败,为避免重复输出不再重试: {e}")
                raise
            retry_count += 1
            if retry_count > MAX_RETRIES:
                logger.error(f"流式LLM调用失败,已达到最大重试次数 {MAX_RETRIES}: {e}")
                raise
            logger.warning(f"流式LLM调用第 {retry_count} 次失败,{RETRY_DELAY}秒后重试: {e}")
            await asyncio.sleep(RETRY_DELAY)