translate not fin

commit c376e46f4d (parent 8d528190a9)
Author: 505030475
Date: 2023-05-19 23:52:20 +08:00
3 changed files with 792 additions and 114 deletions

docs/translate_english.json (new file, 544 lines)
File diff suppressed because one or more lines are too long

multi_language.py

@@ -1,10 +1,13 @@
import os
import json
import functools
import re
import pickle
import time
CACHE_FOLDER = "gpt_log"
blacklist = ['multi-language', 'gpt_log', '.git', 'private_upload']
LANG = "English"
if not os.path.exists(CACHE_FOLDER):
    os.makedirs(CACHE_FOLDER)
@@ -78,7 +81,6 @@ def lru_file_cache(maxsize=128, ttl=None, filename=None):
    return decorator_function

def contains_chinese(string):
    """
    Returns True if the given string contains Chinese characters, False otherwise.
@@ -86,122 +88,259 @@ def contains_chinese(string):
    """
    chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
    return chinese_regex.search(string) is not None
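# Illustrative behavior (not part of the original file):
#   contains_chinese('你好 world')  -> True
#   contains_chinese('hello world') -> False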
def split_list(lst, n_each_req):
    """
    Split a list into smaller lists, each with a maximum number of elements.
    :param lst: the list to split
    :param n_each_req: the maximum number of elements in each sub-list
    :return: a list of sub-lists
    """
    result = []
    for i in range(0, len(lst), n_each_req):
        result.append(lst[i:i + n_each_req])
    return result
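# Illustrative behavior: split_list([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]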
def map_to_json(map, language):
    dict_ = read_map_from_json(language)
    dict_.update(map)
    with open(f'docs/translate_{language.lower()}.json', 'w', encoding='utf8') as f:
        json.dump(dict_, f, indent=4, ensure_ascii=False)

def read_map_from_json(language):
    if os.path.exists(f'docs/translate_{language.lower()}.json'):
        with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f:
            return json.load(f)
    return {}

cached_translation = {}
cached_translation = read_map_from_json(language=LANG)
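# Note: map_to_json/read_map_from_json round-trip the accumulated
# {Chinese: translation} map through docs/translate_<language>.json, so
# repeated runs only send strings to the LLM that are not cached yet.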
@lru_file_cache(maxsize=10, ttl=1e40, filename="translation_cache")
def trans(word_to_translate, language, special=False):
    if len(word_to_translate) == 0: return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies
    proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
        get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
    llm_kwargs = {
        'api_key': API_KEY,
        'llm_model': LLM_MODEL,
        'top_p': 1.0,
        'max_length': None,
        'temperature': 0.0,
    }
    N_EACH_REQ = 16
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    inputs_array = [str(s) for s in word_to_translate_split]
    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    if special:  # to English using CamelCase Naming Convention
        sys_prompt_array = ["Translate following names to English with CamelCase naming convention. Keep original format" for _ in inputs_array]
    else:
        sys_prompt_array = [f"Translate following sentences to {LANG}. Keep original format." for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    for i, r in enumerate(result):
        if i % 2 == 1:
            try:
                # both entries should be stringified python lists; pair them up
                res_before_trans = eval(result[i-1])
                res_after_trans = eval(result[i])
                for a, b in zip(res_before_trans, res_after_trans):
                    translated_result[a] = b
            except:
                try:
                    # fall back to manual parsing when the reply is not a valid python list
                    res_before_trans = eval(result[i-1])
                    result[i] = result[i].strip('[\']')
                    res_after_trans = [s for s in result[i].split("', '")]
                    for a, b in zip(res_before_trans, res_after_trans):
                        translated_result[a] = b
                except:
                    # give up on this batch and mark its entries as untranslated
                    res_before_trans = eval(result[i-1])
                    for a in res_before_trans:
                        translated_result[a] = None
    return translated_result
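# Assumed shape of `result` (based on how inputs and histories are paired above):
# it alternates stringified input lists with model replies, e.g.
#   result = ["['你好']", "['Hello']"]  ->  translated_result == {'你好': 'Hello'}
# Entries whose reply cannot be parsed are mapped to None.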
def step_1_core_key_translate():
    def extract_chinese_characters(file_path):
        syntax = []
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        import ast
        root = ast.parse(content)
        for node in ast.walk(root):
            if isinstance(node, ast.Name):
                if contains_chinese(node.id): syntax.append(node.id)
            if isinstance(node, ast.Import):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
            elif isinstance(node, ast.ImportFrom):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
                for k in node.module.split('.'):
                    if contains_chinese(k): syntax.append(k)
        return syntax

    def extract_chinese_characters_from_directory(directory_path):
        chinese_characters = []
        for root, dirs, files in os.walk(directory_path):
            if any([b in root for b in blacklist]):
                continue
            for file in files:
                if file.endswith('.py'):
                    file_path = os.path.join(root, file)
                    chinese_characters.extend(extract_chinese_characters(file_path))
        return chinese_characters
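    # The nested extractors above collect identifiers (ast.Name) and import
    # names containing Chinese; string literals are handled separately in
    # step_2_core_key_translate below.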
    directory_path = './'
    chinese_core_names = extract_chinese_characters_from_directory(directory_path)
    chinese_core_keys = [name for name in chinese_core_names]
    chinese_core_keys_norepeat = []
    for d in chinese_core_keys:
        if d not in chinese_core_keys_norepeat: chinese_core_keys_norepeat.append(d)
    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_core_keys_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)
    need_translate_mapping = trans(need_translate, language=LANG, special=True)
    map_to_json(need_translate_mapping, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    # sort by descending key length so longer identifiers are replaced before their substrings
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
    chinese_core_keys_norepeat_mapping = {}
    for k in chinese_core_keys_norepeat:
        chinese_core_keys_norepeat_mapping.update({k: cached_translation[k]})
    # ===============================================
    # copy
    # ===============================================
    def copy_source_code():
        from toolbox import get_conf
        import shutil
        import os
        try: shutil.rmtree(f'./multi-language/{LANG}/')
        except: pass
        os.makedirs(f'./multi-language', exist_ok=True)
        backup_dir = f'./multi-language/{LANG}/'
        shutil.copytree('./', backup_dir, ignore=lambda x, y: blacklist)
    copy_source_code()
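    # shutil.copytree calls ignore(dir, names) at every directory it visits and
    # skips the returned names; returning the blacklist unconditionally excludes
    # those folders wherever they appear in the tree.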
    # ===============================================
    # primary key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                for k, v in chinese_core_keys_norepeat_mapping.items():
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
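    # After this step, ./multi-language/<LANG>/ holds a copy of the project in
    # which every cached Chinese identifier has been replaced by its CamelCase
    # English translation.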
def step_2_core_key_translate():
    # =================================================================================================
    # step2: translate string literals
    # =================================================================================================
    def get_strings(node):
        strings = []
        # recursively traverse the AST
        for child in ast.iter_child_nodes(node):
            if isinstance(child, ast.Str):
                if contains_chinese(child.s):
                    string_ = child.s.strip().strip(',').strip().strip('.').strip()
                    if string_.startswith('[Local Message]'):
                        string_ = string_.replace('[Local Message]', '')
                        string_ = string_.strip().strip(',').strip().strip('.').strip()
                    strings.append([
                        string_,
                        child.lineno * 10000 + child.col_offset,  # sortable position key: line first, then column
                    ])
            elif isinstance(child, ast.AST):
                strings.extend(get_strings(child))
        return strings
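    # Illustrative: a literal at line 12, column 8 yields the key 12*10000 + 8,
    # so sorting on it orders strings by source position (assuming no line
    # exceeds 10000 columns).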
    string_literals = []
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                import ast
                tree = ast.parse(content)
                res = get_strings(tree)
                string_literals.extend(res)

    chinese_literal_names = []
    for string, offset in string_literals:
        chinese_literal_names.append(string)
    chinese_literal_names_norepeat = []
    for d in chinese_literal_names:
        if d not in chinese_literal_names_norepeat: chinese_literal_names_norepeat.append(d)

    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_literal_names_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)
    up = trans(need_translate, language=LANG, special=False)
    map_to_json(up, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    # sort by descending key length so longer strings are replaced before their substrings
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
    # ===============================================
    # literal key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                for k, v in cached_translation.items():
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)

step_1_core_key_translate()
step_2_core_key_translate()
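# Intended usage (a sketch): run this script from the repository root; the
# translated copy of the project is written to ./multi-language/<LANG>/.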

theme.py

@@ -103,35 +103,30 @@ def adjust_theme():
advanced_css = """
/* Tables: 1em vertical margin, collapsed cell borders, visible empty cells. */
.markdown-body table {
    margin: 1em 0;
    border-collapse: collapse;
    empty-cells: show;
}

/* Table cells: 5px padding, 1.2px border in --border-color-primary. */
.markdown-body th, .markdown-body td {
    border: 1.2px solid var(--border-color-primary);
    padding: 5px;
}

/* Table header background: rgba(175,184,193,0.2). */
.markdown-body thead {
    background-color: rgba(175,184,193,0.2);
}

/* Header cells: .5em / .2em padding. */
.markdown-body thead th {
    padding: .5em .2em;
}

/* Trim the default list indent so list markers line up with the text. */
.markdown-body ol, .markdown-body ul {
    padding-inline-start: 2em !important;
}

/* Chat box: rounded bubbles, max width, shadow. */
[class *= "message"] {
    border-radius: var(--radius-xl) !important;
    /* padding: var(--spacing-xl) !important; */
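/* Note: [class*="message"] is an attribute-substring selector, so it matches
   any element whose class contains "message" (the chat bubbles). */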
@@ -151,7 +146,7 @@ advanced_css = """
    border-bottom-right-radius: 0 !important;
}

/* Inline code: light gray background, rounded corners, spacing. */
.markdown-body code {
    display: inline;
    white-space: break-spaces;
@@ -171,7 +166,7 @@ advanced_css = """
    background-color: rgba(175,184,193,0.2);
}

/* Code blocks: background color, outer margin, rounded corners. */
.markdown-body pre code {
    display: block;
    overflow: auto;