chatgpt_academic/multi_language.py

"""
    Translate this project to other languages (experimental, please open an issue if there is any bug)


    Usage:
        1. modify config.py, set your LLM_MODEL and API_KEY(s) to provide access to OPENAI (or any other LLM model provider)

        2. modify LANG (below ↓)
            LANG = "English"

        3. modify TransPrompt (below ↓)
            TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #."

        4. Run `python multi_language.py`.
            Note: You need to run it multiple times to increase translation coverage because GPT makes mistakes sometimes.

        5. Find the translated program in `multi-language\English\*`

    P.S.

        - The translation mapping will be stored in `docs/translation_xxxx.json`, you can revised mistaken translation there.

        - If you would like to share your `docs/translation_xxxx.json`, (so that everyone can use the cached & revised translation mapping), please open a Pull Request

        - If there is any translation error in `docs/translation_xxxx.json`, please open a Pull Request

        - Welcome any Pull Request, regardless of language
"""

import os
import json
import functools
import re
import pickle
import time
from toolbox import get_conf

CACHE_FOLDER, = get_conf('PATH_LOGGING')

blacklist = ['multi-language', CACHE_FOLDER, '.git', 'private_upload', 'multi_language.py', 'build', '.github', '.vscode', '__pycache__', 'venv']

# LANG = "TraditionalChinese"
# TransPrompt = f"Replace each json value `#` with translated results in Traditional Chinese, e.g., \"原始文本\":\"翻譯後文字\". Keep Json format. Do not answer #."

# LANG = "Japanese"
# TransPrompt = f"Replace each json value `#` with translated results in Japanese, e.g., \"原始文本\":\"テキストの翻訳\". Keep Json format. Do not answer #."

LANG = "English"
TransPrompt = f"Replace each json value `#` with translated results in English, e.g., \"原始文本\":\"TranslatedText\". Keep Json format. Do not answer #."


if not os.path.exists(CACHE_FOLDER):
    os.makedirs(CACHE_FOLDER)


def lru_file_cache(maxsize=128, ttl=None, filename=None):
    """
    Decorator that caches a function's return value after being called with given arguments.
    It uses a Least Recently Used (LRU) cache strategy to limit the size of the cache.
    maxsize: Maximum size of the cache. Defaults to 128.
    ttl: Time-to-Live of the cache. If a value hasn't been accessed for `ttl` seconds, it will be evicted from the cache.
    filename: Name of the file to store the cache in. If not supplied, the function name + ".cache" will be used.
    """
    cache_path = os.path.join(CACHE_FOLDER, f"{filename}.cache") if filename is not None else None

    def decorator_function(func):
        cache = {}
        _cache_info = {
            "hits": 0,
            "misses": 0,
            "maxsize": maxsize,
            "currsize": 0,
            "ttl": ttl,
            "filename": cache_path,
        }

        @functools.wraps(func)
        def wrapper_function(*args, **kwargs):
            key = str((args, frozenset(kwargs)))
            if key in cache:
                if _cache_info["ttl"] is None or (cache[key][1] + _cache_info["ttl"]) >= time.time():
                    _cache_info["hits"] += 1
                    print(f'Warning, reading cache, last read {(time.time()-cache[key][1])//60} minutes ago'); time.sleep(2)
                    cache[key][1] = time.time()
                    return cache[key][0]
                else:
                    del cache[key]

            result = func(*args, **kwargs)
            cache[key] = [result, time.time()]
            _cache_info["misses"] += 1
            _cache_info["currsize"] += 1

            if _cache_info["currsize"] > _cache_info["maxsize"]:
                oldest_key = None
                for k in cache:
                    if oldest_key is None:
                        oldest_key = k
                    elif cache[k][1] < cache[oldest_key][1]:
                        oldest_key = k
                del cache[oldest_key]
                _cache_info["currsize"] -= 1

            if cache_path is not None:
                with open(cache_path, "wb") as f:
                    pickle.dump(cache, f)

            return result

        def cache_info():
            return _cache_info

        wrapper_function.cache_info = cache_info

        if cache_path is not None and os.path.exists(cache_path):
            with open(cache_path, "rb") as f:
                cache = pickle.load(f)
            _cache_info["currsize"] = len(cache)

        return wrapper_function

    return decorator_function

def contains_chinese(string):
    """
    Returns True if the given string contains Chinese characters, False otherwise.
    """
    chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
    return chinese_regex.search(string) is not None

def split_list(lst, n_each_req):
    """
    Split a list into smaller lists, each with a maximum number of elements.
    :param lst: the list to split
    :param n_each_req: the maximum number of elements in each sub-list
    :return: a list of sub-lists
    """
    result = []
    for i in range(0, len(lst), n_each_req):
        result.append(lst[i:i + n_each_req])
    return result

def map_to_json(map, language):
    dict_ = read_map_from_json(language)
    dict_.update(map)
    with open(f'docs/translate_{language.lower()}.json', 'w', encoding='utf8') as f:
        json.dump(dict_, f, indent=4, ensure_ascii=False)

def read_map_from_json(language):
    if os.path.exists(f'docs/translate_{language.lower()}.json'):
        with open(f'docs/translate_{language.lower()}.json', 'r', encoding='utf8') as f:
            res = json.load(f)
            res = {k:v for k, v in res.items() if v is not None and contains_chinese(k)}
            return res
    return {}

def advanced_split(splitted_string, spliter, include_spliter=False):
    splitted_string_tmp = []
    for string_ in splitted_string:
        if spliter in string_:
            splitted = string_.split(spliter)
            for i, s in enumerate(splitted):
                if include_spliter:
                    if i != len(splitted)-1:
                        splitted[i] += spliter
                splitted[i] = splitted[i].strip()
            for i in reversed(range(len(splitted))):
                if not contains_chinese(splitted[i]):
                    splitted.pop(i)
            splitted_string_tmp.extend(splitted)
        else:
            splitted_string_tmp.append(string_)
    splitted_string = splitted_string_tmp
    return splitted_string_tmp

cached_translation = {}
cached_translation = read_map_from_json(language=LANG)

def trans(word_to_translate, language, special=False):
    if len(word_to_translate) == 0: return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies
    proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
        get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
    llm_kwargs = {
        'api_key': API_KEY,
        'llm_model': LLM_MODEL,
        'top_p':1.0,
        'max_length': None,
        'temperature':0.4,
    }
    import random
    N_EACH_REQ = random.randint(16, 32)
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    inputs_array = [str(s) for s in word_to_translate_split]
    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    if special: #  to English using CamelCase Naming Convention
        sys_prompt_array = [f"Translate following names to English with CamelCase naming convention. Keep original format" for _ in inputs_array]
    else:
        sys_prompt_array = [f"Translate following sentences to {LANG}. E.g., You should translate sentences to the following format ['translation of sentence 1', 'translation of sentence 2']. Do NOT answer with Chinese!" for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    for i, r in enumerate(result):
        if i%2 == 1:
            try:
                res_before_trans = eval(result[i-1])
                res_after_trans = eval(result[i])
                if len(res_before_trans) != len(res_after_trans):
                    raise RuntimeError
                for a,b in zip(res_before_trans, res_after_trans):
                    translated_result[a] = b
            except:
                # try:
                    # res_before_trans = word_to_translate_split[(i-1)//2]
                    # res_after_trans = [s for s in result[i].split("', '")]
                #     for a,b in zip(res_before_trans, res_after_trans):
                #         translated_result[a] = b
                # except:
                print('GPT answers with unexpected format, some words may not be translated, but you can try again later to increase translation coverage.')
                res_before_trans = eval(result[i-1])
                for a in res_before_trans:
                    translated_result[a] = None
    return translated_result


def trans_json(word_to_translate, language, special=False):
    if len(word_to_translate) == 0: return {}
    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
    from toolbox import get_conf, ChatBotWithCookies
    proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
        get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
    llm_kwargs = {
        'api_key': API_KEY,
        'llm_model': LLM_MODEL,
        'top_p':1.0,
        'max_length': None,
        'temperature':0.1,
    }
    import random
    N_EACH_REQ = random.randint(16, 32)
    random.shuffle(word_to_translate)
    word_to_translate_split = split_list(word_to_translate, N_EACH_REQ)
    inputs_array = [{k:"#" for k in s} for s in word_to_translate_split]
    inputs_array = [ json.dumps(i, ensure_ascii=False)  for i in inputs_array]

    inputs_show_user_array = inputs_array
    history_array = [[] for _ in inputs_array]
    sys_prompt_array = [TransPrompt for _ in inputs_array]
    chatbot = ChatBotWithCookies(llm_kwargs)
    gpt_say_generator = request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array,
        inputs_show_user_array,
        llm_kwargs,
        chatbot,
        history_array,
        sys_prompt_array,
    )
    while True:
        try:
            gpt_say = next(gpt_say_generator)
            print(gpt_say[1][0][1])
        except StopIteration as e:
            result = e.value
            break
    translated_result = {}
    for i, r in enumerate(result):
        if i%2 == 1:
            try:
                translated_result.update(json.loads(result[i]))
            except:
                print(result[i])
    print(result)
    return translated_result


def step_1_core_key_translate():
    LANG_STD = 'std'
    def extract_chinese_characters(file_path):
        syntax = []
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            import ast
            root = ast.parse(content)
            for node in ast.walk(root):
                if isinstance(node, ast.Name):
                    if contains_chinese(node.id): syntax.append(node.id)
                if isinstance(node, ast.Import):
                    for n in node.names:
                        if contains_chinese(n.name): syntax.append(n.name)
                elif isinstance(node, ast.ImportFrom):
                    for n in node.names:
                        if contains_chinese(n.name): syntax.append(n.name)
                        # if node.module is None: print(node.module)
                        for k in node.module.split('.'):
                            if contains_chinese(k): syntax.append(k)
            return syntax

    def extract_chinese_characters_from_directory(directory_path):
        chinese_characters = []
        for root, dirs, files in os.walk(directory_path):
            if any([b in root for b in blacklist]):
                continue
            print(files)
            for file in files:
                if file.endswith('.py'):
                    file_path = os.path.join(root, file)
                    chinese_characters.extend(extract_chinese_characters(file_path))
        return chinese_characters

    directory_path = './'
    chinese_core_names = extract_chinese_characters_from_directory(directory_path)
    chinese_core_keys = [name for name in chinese_core_names]
    chinese_core_keys_norepeat = []
    for d in chinese_core_keys:
        if d not in chinese_core_keys_norepeat: chinese_core_keys_norepeat.append(d)
    need_translate = []
    cached_translation = read_map_from_json(language=LANG_STD)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_core_keys_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)

    need_translate_mapping = trans(need_translate, language=LANG_STD, special=True)
    map_to_json(need_translate_mapping, language=LANG_STD)
    cached_translation = read_map_from_json(language=LANG_STD)
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))

    chinese_core_keys_norepeat_mapping = {}
    for k in chinese_core_keys_norepeat:
        chinese_core_keys_norepeat_mapping.update({k:cached_translation[k]})
    chinese_core_keys_norepeat_mapping = dict(sorted(chinese_core_keys_norepeat_mapping.items(), key=lambda x: -len(x[0])))

    # ===============================================
    # copy
    # ===============================================
    def copy_source_code():

        from toolbox import get_conf
        import shutil
        import os
        try: shutil.rmtree(f'./multi-language/{LANG}/')
        except: pass
        os.makedirs(f'./multi-language', exist_ok=True)
        backup_dir = f'./multi-language/{LANG}/'
        shutil.copytree('./', backup_dir, ignore=lambda x, y: blacklist)
    copy_source_code()

    # ===============================================
    # primary key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                for k, v in chinese_core_keys_norepeat_mapping.items():
                    content = content.replace(k, v)

                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)


def step_2_core_key_translate():

    # =================================================================================================
    # step2
    # =================================================================================================

    def load_string(strings, string_input):
        string_ = string_input.strip().strip(',').strip().strip('.').strip()
        if string_.startswith('[Local Message]'):
            string_ = string_.replace('[Local Message]', '')
            string_ = string_.strip().strip(',').strip().strip('.').strip()
        splitted_string = [string_]
        # --------------------------------------
        splitted_string = advanced_split(splitted_string, spliter="，", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="。", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="）", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="（", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="(", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=")", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="<", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=">", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="[", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="]", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="【", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="】", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="？", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="：", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=":", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=",", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="#", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="\n", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter=";", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="`", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="   ", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="- ", include_spliter=False)
        splitted_string = advanced_split(splitted_string, spliter="---", include_spliter=False)

        # --------------------------------------
        for j, s in enumerate(splitted_string): # .com
            if '.com' in s: continue
            if '\'' in s: continue
            if '\"' in s: continue
            strings.append([s,0])


    def get_strings(node):
        strings = []
        # recursively traverse the AST
        for child in ast.iter_child_nodes(node):
            node = child
            if isinstance(child, ast.Str):
                if contains_chinese(child.s):
                    load_string(strings=strings, string_input=child.s)
            elif isinstance(child, ast.AST):
                strings.extend(get_strings(child))
        return strings

    string_literals = []
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    # comments
                    comments_arr = []
                    for code_sp in content.splitlines():
                        comments = re.findall(r'#.*$', code_sp)
                        for comment in comments:
                            load_string(strings=comments_arr, string_input=comment)
                    string_literals.extend(comments_arr)

                    # strings
                    import ast
                    tree = ast.parse(content)
                    res = get_strings(tree, )
                    string_literals.extend(res)

    [print(s) for s in string_literals]
    chinese_literal_names = []
    chinese_literal_names_norepeat = []
    for string, offset in string_literals:
        chinese_literal_names.append(string)
    chinese_literal_names_norepeat = []
    for d in chinese_literal_names:
        if d not in chinese_literal_names_norepeat: chinese_literal_names_norepeat.append(d)
    need_translate = []
    cached_translation = read_map_from_json(language=LANG)
    cached_translation_keys = list(cached_translation.keys())
    for d in chinese_literal_names_norepeat:
        if d not in cached_translation_keys:
            need_translate.append(d)


    up = trans_json(need_translate, language=LANG, special=False)
    map_to_json(up, language=LANG)
    cached_translation = read_map_from_json(language=LANG)
    LANG_STD = 'std'
    cached_translation.update(read_map_from_json(language=LANG_STD))
    cached_translation = dict(sorted(cached_translation.items(), key=lambda x: -len(x[0])))

    # ===============================================
    # literal key replace
    # ===============================================
    directory_path = f'./multi-language/{LANG}/'
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                syntax = []
                # read again
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                for k, v in cached_translation.items():
                    if v is None: continue
                    if '"' in v:
                        v = v.replace('"', "`")
                    if '\'' in v:
                        v = v.replace('\'', "`")
                    content = content.replace(k, v)

                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)

                if file.strip('.py') in cached_translation:
                    file_new = cached_translation[file.strip('.py')] + '.py'
                    file_path_new = os.path.join(root, file_new)
                    with open(file_path_new, 'w', encoding='utf-8') as f:
                        f.write(content)
                    os.remove(file_path)
step_1_core_key_translate()
step_2_core_key_translate()
print('Finished, checkout generated results at ./multi-language/')