Files
gangyan/langchain-chat/server/chat/check_language.py

106 lines
3.1 KiB
Python
Raw Normal View History

import re
from fastapi.responses import JSONResponse
from configs import logger
from pydantic import BaseModel
# Request model consumed by check_language; it carries the text to classify.
class CheckLanguage(BaseModel):
    # Text whose language mix is inspected (read as request.query).
    query: str
def check_language(request: CheckLanguage) -> JSONResponse:
    """Pick a translation target language from the script mix of the query.

    The text is treated as Chinese when Chinese characters make up more than
    half of the counted units (Chinese characters + English words). Chinese
    input is directed to English (``"en"``); anything else is directed to
    Chinese (``"zh-cn"``).

    Args:
        request: Model whose ``query`` attribute holds the text to classify.

    Returns:
        A JSONResponse. On success the body carries ``code`` 200, ``message``,
        ``query`` (the text with digits and whitespace removed) and
        ``to_lang``. When nothing is left after removing digits and
        whitespace, the body carries ``code`` 500 and the HTTP status is 500.
        If building the success response raises, the body is
        ``{"error": <exception text>}`` with HTTP status 500.
    """
    raw_query = request.query
    # The stripped form drives the emptiness check and is what the response
    # echoes back, exactly as before.
    # NOTE(review): the echoed query has its whitespace removed; confirm that
    # no caller forwards it to the translator.
    stripped_query = re.sub(r'[\d\s]', '', raw_query)
    if not stripped_query:  # nothing but digits/whitespace: report an error
        return JSONResponse(
            content={
                "code": 500,
                "message": "输入为空,请重试"
            },
            status_code=500,
        )

    # Count on the raw text. Once whitespace is removed, neighbouring English
    # words fuse into a single letter run, which undercounts English words and
    # overstates the Chinese ratio for mixed or mostly-English input.
    word_count = count_words(raw_query)
    chinese_count = word_count['chinese_chars']
    english_word_count = word_count['english_words']
    total_count = word_count['total_count']

    # total_count is 0 when the text has neither Chinese characters nor ASCII
    # letters (e.g. punctuation only); both ratios then fall back to 0.
    chinese_ratio = chinese_count / total_count if total_count > 0 else 0
    english_ratio = english_word_count / total_count if total_count > 0 else 0
    logger.info(f"🔍[语言检测] 中文字符比例: {chinese_ratio:.2f}, 英文单词比例: {english_ratio:.2f}")

    # Chinese text is translated to English, everything else to Chinese.
    to_lang = "en" if chinese_ratio > 0.5 else "zh-cn"
    try:
        return JSONResponse(content={
            "code": 200,
            "message": "成功",
            "query": stripped_query,
            "to_lang": to_lang
        })
    except Exception as e:
        # Rendering the JSON body can fail (e.g. text that cannot be encoded);
        # report it in the same shape as before.
        return JSONResponse(content={"error": str(e)}, status_code=500)
def count_words(query: str) -> dict:
    """Count Chinese characters and English words in ``query``.

    A Chinese character is a single code point in the range U+4E00-U+9FFF.
    An English word is a maximal run of ASCII letters.

    Args:
        query: Text to scan.

    Returns:
        A dict with ``chinese_chars``, ``english_words`` and ``total_count``,
        where ``total_count`` is the sum of the other two.
    """
    # One match per Chinese character.
    han_total = sum(1 for _ in re.finditer(r'[\u4e00-\u9fff]', query))
    # One match per uninterrupted run of letters.
    latin_total = sum(1 for _ in re.finditer(r'[a-zA-Z]+', query))

    tally = {"total_count": han_total + latin_total}
    tally["chinese_chars"] = han_total
    tally["english_words"] = latin_total
    return tally
def get_supported_languages():
    """Return the languages currently supported.

    The response body holds a ``languages`` list whose entries contain:
        code: the value to pass as the ``to_lang`` parameter when translating.
        name: the language name to display.
    """
    # (code, display name) pairs, in the order they are returned.
    supported = (
        ("zh-cn", "中文"),
        ("en", "英语"),
        ("ja", "日语"),
        ("ko", "韩语"),
        ("fr", "法语"),
        ("de", "德语"),
        ("es", "西班牙语"),
        ("it", "意大利语"),
        ("pt", "葡萄牙语"),
        ("ru", "俄语"),
        ("ar", "阿拉伯语"),
        ("hi", "印地语"),
        ("bn", "孟加拉语"),
        ("pa", "旁遮普语"),
        ("jv", "爪哇语"),
        ("ms", "马来语"),
        ("vi", "越南语"),
        ("th", "泰语"),
        ("tr", "土耳其语"),
        ("fa", "波斯语"),
        ("pl", "波兰语"),
        ("uk", "乌克兰语"),
        ("ro", "罗马尼亚语"),
        ("nl", "荷兰语"),
        ("el", "希腊语"),
    )

    entries = []
    for lang_code, display_name in supported:
        entries.append({"code": lang_code, "name": display_name})

    return JSONResponse(content={"languages": entries})