From f38929b14981dd08fb94b5af31afdc9042ce09d0 Mon Sep 17 00:00:00 2001
From: qingxu fu <505030475@qq.com>
Date: Mon, 10 Apr 2023 00:29:53 +0800
Subject: [PATCH] =?UTF-8?q?+Latex=E5=85=A8=E6=96=87=E4=B8=AD=E8=8B=B1?=
 =?UTF-8?q?=E4=BA=92=E8=AF=91=E6=8F=92=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functional.py              |  17 ++++
 crazy_functions/Latex全文翻译.py | 158 +++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 crazy_functions/Latex全文翻译.py

diff --git a/crazy_functional.py b/crazy_functional.py
index 4746804..ee40a21 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -78,6 +78,8 @@ def get_crazy_functions():
     from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
     from crazy_functions.Latex全文润色 import Latex英文润色
     from crazy_functions.Latex全文润色 import Latex中文润色
+    from crazy_functions.Latex全文翻译 import Latex中译英
+    from crazy_functions.Latex全文翻译 import Latex英译中
 
     function_plugins.update({
         "批量翻译PDF文档(多线程)": {
@@ -128,6 +130,21 @@ def get_crazy_functions():
             "AsButton": False,  # 加入下拉菜单中
             "Function": HotReload(Latex中文润色)
         },
+
+        "Latex项目全文中译英(输入路径或上传压缩包)": {
+            # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
+            "Color": "stop",
+            "AsButton": False,  # 加入下拉菜单中
+            "Function": HotReload(Latex中译英)
+        },
+        "Latex项目全文英译中(输入路径或上传压缩包)": {
+            # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
+            "Color": "stop",
+            "AsButton": False,  # 加入下拉菜单中
+            "Function": HotReload(Latex英译中)
+        },
+
+
     })
 
     ###################### 第三组插件 ###########################
diff --git a/crazy_functions/Latex全文翻译.py b/crazy_functions/Latex全文翻译.py
new file mode 100644
index 0000000..8d2fdb7
--- /dev/null
+++ b/crazy_functions/Latex全文翻译.py
@@ -0,0 +1,158 @@
+from toolbox import update_ui
+from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
+fast_debug = False
+
+class PaperFileGroup():
+    """Hold the .tex files of one project and their token-limited segments.
+
+    The caller appends to ``file_paths`` / ``file_contents`` (one entry per
+    file); ``run_file_split`` then fills the parallel ``sp_file_*`` lists
+    (one entry per segment).
+    """
+
+    def __init__(self):
+        self.file_paths = []
+        self.file_contents = []
+        self.sp_file_contents = []
+        self.sp_file_index = []
+        self.sp_file_tag = []
+
+        # Token counter backed by the tiktoken encoding for LLM_MODEL
+        import tiktoken
+        from toolbox import get_conf
+        enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+        def get_token_num(txt): return len(enc.encode(txt))
+        self.get_token_num = get_token_num
+
+    def run_file_split(self, max_token_limit=1900):
+        """Split every file whose token count reaches ``max_token_limit``.
+
+        Shorter files are kept whole and tagged with their own path; longer
+        ones are cut by ``breakdown_txt_to_satisfy_token_limit_for_pdf`` and
+        each piece is tagged ``<path>.part-<j>.tex``. ``sp_file_index`` stores
+        the index of the originating file for every segment.
+        """
+        for index, file_content in enumerate(self.file_contents):
+            if self.get_token_num(file_content) < max_token_limit:
+                self.sp_file_contents.append(file_content)
+                self.sp_file_index.append(index)
+                self.sp_file_tag.append(self.file_paths[index])
+            else:
+                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                for j, segment in enumerate(segments):
+                    self.sp_file_contents.append(segment)
+                    self.sp_file_index.append(index)
+                    self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
+
+        print('Segmentation: done')
+
+def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'):
+    """Translate the given .tex files segment by segment and write a report.
+
+    Generator: delegates to the multi-threaded request helper and to
+    ``update_ui`` via ``yield from``.
+
+    ``language`` must be ``'en->zh'`` or ``'zh->en'``; any other value,
+    including the default ``'en'``, raises ``ValueError``. ``project_folder``,
+    ``plugin_kwargs`` and ``system_prompt`` are not used by this function.
+    """
+    import time, re
+    from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
+
+    # Reject unknown directions up front; previously they only surfaced later
+    # as an UnboundLocalError on inputs_array.
+    if language == 'en->zh':
+        instruction = r"Below is a section from an English academic paper, translate it into Chinese, do not modify any latex command such as \section, \cite and equations:"
+    elif language == 'zh->en':
+        instruction = r"Below is a section from a Chinese academic paper, translate it into English, do not modify any latex command such as \section, \cite and equations:"
+    else:
+        raise ValueError(f"Unsupported language: {language!r} (expected 'en->zh' or 'zh->en')")
+
+    # <-------- Read the LaTeX files and strip all comments ---------->
+    pfg = PaperFileGroup()
+    # Negative lookbehind keeps an escaped percent sign (\%) in the text
+    comment_pattern = re.compile(r'(?<!\\)%.*')
+    for fp in file_manifest:
+        with open(fp, 'r', encoding='utf-8') as f:
+            file_content = f.read()
+        # Store the text with its comments removed
+        pfg.file_paths.append(fp)
+        pfg.file_contents.append(comment_pattern.sub('', file_content))
+
+    # <-------- Split LaTeX files that are too long ---------->
+    pfg.run_file_split(max_token_limit=1024)
+    n_split = len(pfg.sp_file_contents)
+
+    # <-------- Translate all segments with multiple threads ---------->
+    inputs_array = [instruction + f"\n\n{frag}" for frag in pfg.sp_file_contents]
+    inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
+    sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
+
+    gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
+        inputs_array=inputs_array,
+        inputs_show_user_array=inputs_show_user_array,
+        llm_kwargs=llm_kwargs,
+        chatbot=chatbot,
+        history_array=[[""] for _ in range(n_split)],
+        sys_prompt_array=sys_prompt_array,
+        max_workers=10,  # maximum parallel load allowed by OpenAI
+        scroller_max_len = 80
+    )
+
+    # <-------- Collect the results and finish ---------->
+    create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + "-chatgpt.polish.md"
+    res = write_results_to_file(gpt_response_collection, file_name=create_report_file_name)
+    history = gpt_response_collection
+    chatbot.append((f"{fp}完成了吗?", res))
+    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+
+
+def _translate_latex_project(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language):
+    """Shared body of both plugins: check ``txt``, find .tex files, translate.
+
+    ``txt`` is treated as a path; every ``*.tex`` file found recursively below
+    it is handed to ``多文件翻译`` together with ``language``.
+    """
+    # Basic information: purpose and contributor
+    chatbot.append([
+        "函数插件功能?",
+        "对整个Latex项目进行翻译。函数插件贡献者: Binary-Husky"])
+    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+
+    # Try to import the dependency; if it is missing, suggest how to install it
+    try:
+        import tiktoken
+    except ImportError:
+        report_execption(chatbot, history,
+                         a=f"解析项目: {txt}",
+                         b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
+        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+        return
+    history = []    # clear the history so the input does not overflow
+    import glob, os
+    if os.path.exists(txt):
+        project_folder = txt
+    else:
+        if txt == "": txt = '空空如也的输入栏'
+        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+        return
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
+    if len(file_manifest) == 0:
+        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+        return
+    yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language=language)
+
+
+@CatchException
+def Latex英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    """Plugin entry point: translate a whole LaTeX project from English to Chinese."""
+    yield from _translate_latex_project(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en->zh')
+
+
+@CatchException
+def Latex中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    """Plugin entry point: translate a whole LaTeX project from Chinese to English."""
+    yield from _translate_latex_project(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh->en')