translate not fin

commit c376e46f4d (parent 8d528190a9)
Author: 505030475
Date: 2023-05-19 23:52:20 +08:00
3 changed files with 792 additions and 114 deletions

docs/translate_english.json (new file, 544 lines)
File diff suppressed because one or more lines are too long

multi_language.py

@@ -1,10 +1,13 @@
import os
import json
import functools
import re
import pickle
import time
CACHE_FOLDER = "gpt_log"
blacklist = ['multi-language', 'gpt_log', '.git', 'private_upload']
LANG = "English"
if not os.path.exists(CACHE_FOLDER):
    os.makedirs(CACHE_FOLDER)
@@ -78,7 +81,6 @@ def lru_file_cache(maxsize=128, ttl=None, filename=None):
    return decorator_function

def contains_chinese(string):
    """
    Returns True if the given string contains Chinese characters, False otherwise.
@@ -86,122 +88,259 @@ def contains_chinese(string):
    """
    chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
    return chinese_regex.search(string) is not None
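# Illustrative behavior (not part of the original file):
#   contains_chinese('你好 world')  -> True
#   contains_chinese('hello world') -> False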
def split_list(lst, n_each_req):
    """
    Split a list into smaller lists, each with a maximum number of elements.
    :param lst: the list to split
    :param n_each_req: the maximum number of elements in each sub-list
    :return: a list of sub-lists
    """
    result = []
    for i in range(0, len(lst), n_each_req):
        result.append(lst[i:i + n_each_req])
    return result
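# Illustrative behavior: split_list([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]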
def map_to_json(map, language):
    dict_ = read_map_from_json(language)
    dict_.update(map)
    with open(f'docs/translate_{language.lower()}.json', 'w', encoding='utf8') as f:
        json.dump(dict_, f, indent=4, ensure_ascii=False)

def read_map_from_json(language):
    if os.path.exists(f'docs/translate_{language.lower()}.json'):
        with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f:
            return json.load(f)
    return {}

cached_translation = {}
cached_translation = read_map_from_json(language=LANG)
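# Note: map_to_json/read_map_from_json round-trip the accumulated
# {Chinese: translation} map through docs/translate_<language>.json, so
# repeated runs only send strings to the LLM that are not cached yet.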
@lru_file_cache(maxsize=10, ttl=1e40, filename="translation_cache")
def trans(word_to_translate, language, special=False):
    if len(word_to_translate) == 0: return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies
    proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
        get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
    llm_kwargs = {
        'api_key': API_KEY,
        'llm_model': LLM_MODEL,
        'top_p': 1.0,
        'max_length': None,
        'temperature': 0.0,
    }
    N_EACH_REQ = 16
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    inputs_array = [str(s) for s in word_to_translate_split]
    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    if special:  # to English using CamelCase Naming Convention
        sys_prompt_array = ["Translate following names to English with CamelCase naming convention. Keep original format" for _ in inputs_array]
    else:
        sys_prompt_array = [f"Translate following sentences to {LANG}. Keep original format." for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    for i, r in enumerate(result):
        if i % 2 == 1:
            try:
                # both entries should be stringified python lists; pair them up
                res_before_trans = eval(result[i-1])
                res_after_trans = eval(result[i])
                for a, b in zip(res_before_trans, res_after_trans):
                    translated_result[a] = b
            except:
                try:
                    # fall back to manual parsing when the reply is not a valid python list
                    res_before_trans = eval(result[i-1])
                    result[i] = result[i].strip('[\']')
                    res_after_trans = [s for s in result[i].split("', '")]
                    for a, b in zip(res_before_trans, res_after_trans):
                        translated_result[a] = b
                except:
                    # give up on this batch and mark its entries as untranslated
                    res_before_trans = eval(result[i-1])
                    for a in res_before_trans:
                        translated_result[a] = None
    return translated_result
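# Assumed shape of `result` (based on how inputs and histories are paired above):
# it alternates stringified input lists with model replies, e.g.
#   result = ["['你好']", "['Hello']"]  ->  translated_result == {'你好': 'Hello'}
# Entries whose reply cannot be parsed are mapped to None.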
def step_1_core_key_translate():
    def extract_chinese_characters(file_path):
        syntax = []
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        import ast
        root = ast.parse(content)
        for node in ast.walk(root):
            if isinstance(node, ast.Name):
                if contains_chinese(node.id): syntax.append(node.id)
            if isinstance(node, ast.Import):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
            elif isinstance(node, ast.ImportFrom):
                for n in node.names:
                    if contains_chinese(n.name): syntax.append(n.name)
                for k in node.module.split('.'):
                    if contains_chinese(k): syntax.append(k)
        return syntax

    def extract_chinese_characters_from_directory(directory_path):
        chinese_characters = []
        for root, dirs, files in os.walk(directory_path):
            if any([b in root for b in blacklist]):
                continue
            for file in files:
                if file.endswith('.py'):
                    file_path = os.path.join(root, file)
                    chinese_characters.extend(extract_chinese_characters(file_path))
        return chinese_characters
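    # The nested extractors above collect identifiers (ast.Name) and import
    # names containing Chinese; string literals are handled separately in
    # step_2_core_key_translate below.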
    directory_path = './'
    chinese_core_names = extract_chinese_characters_from_directory(directory_path)
    chinese_core_keys = [name for name in chinese_core_names]
    chinese_core_keys_norepeat = []
    for d in chinese_core_keys:
        if d not in chinese_core_keys_norepeat: chinese_core_keys_norepeat.append(d)
    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_core_keys_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)
    need_translate_mapping = trans(need_translate, language=LANG, special=True)
    map_to_json(need_translate_mapping, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    # sort by descending key length so longer identifiers are replaced before their substrings
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
    chinese_core_keys_norepeat_mapping = {}
    for k in chinese_core_keys_norepeat:
        chinese_core_keys_norepeat_mapping.update({k: cached_translation[k]})
    # ===============================================
    # copy
    # ===============================================
    def copy_source_code():
        from toolbox import get_conf
        import shutil
        import os
        try: shutil.rmtree(f'./multi-language/{LANG}/')
        except: pass
        os.makedirs(f'./multi-language', exist_ok=True)
        backup_dir = f'./multi-language/{LANG}/'
        shutil.copytree('./', backup_dir, ignore=lambda x, y: blacklist)
    copy_source_code()
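    # shutil.copytree calls ignore(dir, names) at every directory it visits and
    # skips the returned names; returning the blacklist unconditionally excludes
    # those folders wherever they appear in the tree.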
    # ===============================================
    # primary key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                for k, v in chinese_core_keys_norepeat_mapping.items():
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
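    # After this step, ./multi-language/<LANG>/ holds a copy of the project in
    # which every cached Chinese identifier has been replaced by its CamelCase
    # English translation.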
def step_2_core_key_translate():
    # =================================================================================================
    # step2: translate string literals
    # =================================================================================================
    def get_strings(node):
        strings = []
        # recursively traverse the AST
        for child in ast.iter_child_nodes(node):
            if isinstance(child, ast.Str):
                if contains_chinese(child.s):
                    string_ = child.s.strip().strip(',').strip().strip('.').strip()
                    if string_.startswith('[Local Message]'):
                        string_ = string_.replace('[Local Message]', '')
                        string_ = string_.strip().strip(',').strip().strip('.').strip()
                    strings.append([
                        string_,
                        child.lineno * 10000 + child.col_offset,  # sortable position key: line first, then column
                    ])
            elif isinstance(child, ast.AST):
                strings.extend(get_strings(child))
        return strings
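    # Illustrative: a literal at line 12, column 8 yields the key 12*10000 + 8,
    # so sorting on it orders strings by source position (assuming no line
    # exceeds 10000 columns).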
    string_literals = []
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                import ast
                tree = ast.parse(content)
                res = get_strings(tree)
                string_literals.extend(res)

    chinese_literal_names = []
    for string, offset in string_literals:
        chinese_literal_names.append(string)
    chinese_literal_names_norepeat = []
    for d in chinese_literal_names:
        if d not in chinese_literal_names_norepeat: chinese_literal_names_norepeat.append(d)

    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_literal_names_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)
    up = trans(need_translate, language=LANG, special=False)
    map_to_json(up, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    # sort by descending key length so longer strings are replaced before their substrings
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))
    # ===============================================
    # literal key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                for k, v in cached_translation.items():
                    content = content.replace(k, v)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)

step_1_core_key_translate()
step_2_core_key_translate()
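# Intended usage (a sketch): run this script from the repository root; the
# translated copy of the project is written to ./multi-language/<LANG>/.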

theme.py

@@ -103,35 +103,30 @@ def adjust_theme():
advanced_css = """
/* Tables: 1em vertical margin, collapsed cell borders, visible empty cells. */
.markdown-body table {
    margin: 1em 0;
    border-collapse: collapse;
    empty-cells: show;
}

/* Table cells: 5px padding, 1.2px border in --border-color-primary. */
.markdown-body th, .markdown-body td {
    border: 1.2px solid var(--border-color-primary);
    padding: 5px;
}

/* Table header background: rgba(175,184,193,0.2). */
.markdown-body thead {
    background-color: rgba(175,184,193,0.2);
}

/* Header cells: .5em / .2em padding. */
.markdown-body thead th {
    padding: .5em .2em;
}

/* Trim the default list indent so list markers line up with the text. */
.markdown-body ol, .markdown-body ul {
    padding-inline-start: 2em !important;
}

/* Chat box: rounded bubbles, max width, shadow. */
[class *= "message"] {
    border-radius: var(--radius-xl) !important;
    /* padding: var(--spacing-xl) !important; */
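/* Note: [class*="message"] is an attribute-substring selector, so it matches
   any element whose class contains "message" (the chat bubbles). */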
@@ -151,7 +146,7 @@ advanced_css = """
    border-bottom-right-radius: 0 !important;
}

/* Inline code: light gray background, rounded corners, spacing. */
.markdown-body code {
    display: inline;
    white-space: break-spaces;
@@ -171,7 +166,7 @@ advanced_css = """
    background-color: rgba(175,184,193,0.2);
}

/* Code blocks: background color, outer margin, rounded corners. */
.markdown-body pre code {
    display: block;
    overflow: auto;