Files
gangyan/langchain-chat/server/chat/check_language.py

106 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from fastapi.responses import JSONResponse
from configs import logger
from pydantic import BaseModel
class CheckLanguage(BaseModel):
    """Request payload accepted by ``check_language``."""

    # Text whose language is to be detected.
    query: str
def check_language(request: CheckLanguage) -> JSONResponse:
    """Detect whether a query is mostly Chinese and pick the translation target.

    The Chinese ratio is ``chinese_chars / (chinese_chars + english_words)``
    as reported by ``count_words``. When it exceeds 0.5 the text is treated
    as Chinese and the target language is ``"en"``; otherwise the target is
    ``"zh-cn"``.

    Args:
        request: Payload whose ``query`` field holds the text to inspect.

    Returns:
        A ``JSONResponse`` with ``code``, ``message``, ``query`` and
        ``to_lang`` on success; a 500 response with ``code`` and ``message``
        when the query contains nothing but digits and whitespace; or a 500
        response with an ``error`` field if building the success response
        raises.
    """
    original_text = request.query
    # Digits and whitespace are dropped for the emptiness check and for the
    # text echoed back in the response.
    query = re.sub(r'[\d\s]', '', original_text)
    if not query:
        return JSONResponse(
            content={
                "code": 500,
                "message": "输入为空,请重试"
            },
            status_code=500,
        )

    # Count on the unmodified text: once whitespace and digits are removed,
    # neighbouring English words fuse into a single run of letters
    # ("hello world" -> "helloworld"), which undercounts English words and
    # skews the ratio toward Chinese.
    word_count = count_words(original_text)
    chinese_count = word_count['chinese_chars']
    english_word_count = word_count['english_words']
    total_count = word_count['total_count']

    # total_count is 0 when the text has neither Chinese characters nor
    # ASCII letters (e.g. punctuation only); both ratios then fall back to 0.
    chinese_ratio = chinese_count / total_count if total_count > 0 else 0
    english_ratio = english_word_count / total_count if total_count > 0 else 0
    logger.info(f"🔍[语言检测] 中文字符比例: {chinese_ratio:.2f}, 英文单词比例: {english_ratio:.2f}")

    # Mostly-Chinese input is translated to English, everything else to Chinese.
    to_lang = "en" if chinese_ratio > 0.5 else "zh-cn"
    try:
        # NOTE(review): "query" echoes the normalized text (digits and
        # whitespace removed), kept as-is for backward compatibility --
        # confirm whether clients expect the original text instead.
        return JSONResponse(content={
            "code": 200,
            "message": "成功",
            "query": query,
            "to_lang": to_lang
        })
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
def count_words(query: str) -> dict:
    """Count Chinese characters and English words in ``query``.

    A Chinese character is any code point in U+4E00..U+9FFF; an English word
    is a maximal run of ASCII letters.

    Args:
        query: Text to analyse.

    Returns:
        A dict with ``total_count`` (sum of the two counts),
        ``chinese_chars`` and ``english_words``.
    """
    # Each Chinese character counts individually.
    han_hits = re.findall(r'[\u4e00-\u9fff]', query)
    # Each uninterrupted run of ASCII letters counts as one word.
    latin_hits = re.findall(r'[a-zA-Z]+', query)

    n_chinese, n_english = len(han_hits), len(latin_hits)
    return dict(
        total_count=n_chinese + n_english,
        chinese_chars=n_chinese,
        english_words=n_english,
    )
def get_supported_languages():
    """Return the languages currently supported.

    The response body is ``{"languages": [...]}`` where every entry has:

    code: value to pass as the ``to_lang`` parameter when translating.
    name: language name to display.
    """
    # (code, display name) pairs, in the order they are returned.
    supported = (
        ("zh-cn", "中文"),
        ("en", "英语"),
        ("ja", "日语"),
        ("ko", "韩语"),
        ("fr", "法语"),
        ("de", "德语"),
        ("es", "西班牙语"),
        ("it", "意大利语"),
        ("pt", "葡萄牙语"),
        ("ru", "俄语"),
        ("ar", "阿拉伯语"),
        ("hi", "印地语"),
        ("bn", "孟加拉语"),
        ("pa", "旁遮普语"),
        ("jv", "爪哇语"),
        ("ms", "马来语"),
        ("vi", "越南语"),
        ("th", "泰语"),
        ("tr", "土耳其语"),
        ("fa", "波斯语"),
        ("pl", "波兰语"),
        ("uk", "乌克兰语"),
        ("ro", "罗马尼亚语"),
        ("nl", "荷兰语"),
        ("el", "希腊语"),
    )
    languages = [{"code": lang_code, "name": lang_name} for lang_code, lang_name in supported]
    return JSONResponse(content={"languages": languages})