def confirm_answer_is_health(bufo, buf, llm_kwargs, default=True):
    """Heuristically decide whether a model reply looks like a real rewrite.

    The check is purely length based: the reply is accepted when it is at
    least one third (integer division) as long as the original fragment.
    An earlier LLM-based check was removed in favour of this heuristic.

    Args:
        bufo: The original text fragment that was sent to the model.
        buf: The text returned by the model.
        llm_kwargs: Unused; kept so existing callers keep working.
        default: Unused; kept so existing callers keep working.

    Returns:
        bool: True when ``len(buf) >= len(bufo) // 3``.
    """
    return len(buf) >= len(bufo) // 3
content = f.read() + merged_content = merge_tex_files(project_folder, content) + # 使用正则表达式查找注释,并替换为空字符串 + merged_content = re.sub(r'(? + pfg = LatexPaperFileGroup() + for index, r in enumerate(res): + pfg.file_paths.append(index) + pfg.file_contents.append(r) + + pfg.run_file_split(max_token_limit=1024) + n_split = len(pfg.sp_file_contents) + + inputs_array = [r"Below is a section from an academic paper, proofread this section." + + r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + + r"Answer me only with the revised text:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)] + inputs_show_user_array = [f"Proofread {f}" for f in pfg.sp_file_tag] + + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[""] for _ in range(n_split)], + sys_prompt_array=sys_prompt_array, + # max_workers=5, # 并行任务数量限制,最多同时执行5个,其他的排队等待 + scroller_max_len = 80 + ) + + # <-------- 文本碎片重组为完整的tex片段 ----------> + pfg.sp_file_result = [] + for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents): + pfg.sp_file_result.append(gpt_say) + pfg.merge_result() + + final_tex = lps.merge_result(pfg.sp_file_result) + with open(project_folder + '/merge_proofread.tex', 'w', encoding='utf-8', errors='replace') as f: + f.write(final_tex) + # <-------- 整理结果,退出 ----------> + create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md" + res = write_results_to_file(gpt_response_collection, file_name=create_report_file_name) + history = gpt_response_collection + chatbot.append((f"完成了吗?", res)) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return 
@CatchException
def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """Plugin entry point: proofread a LaTeX project and build a highlighted diff PDF.

    This is a generator; every ``yield`` refreshes the UI through ``update_ui``.

    Args:
        txt: Path of the local LaTeX project folder.
        llm_kwargs: Model settings forwarded to the proofreading step.
        plugin_kwargs: Plugin settings forwarded to the proofreading step.
        chatbot: Chat display list; status messages are appended to it.
        history: Incoming history; it is reset before the project is processed.
        system_prompt: Forwarded to the proofreading step.
        web_port: Unused here; part of the plugin signature.

    Returns:
        The path returned by ``编译Latex差别`` (as the generator's return
        value), or ``None`` when a precondition fails.
    """
    import glob, os, shutil

    # Basic information: what the plugin does and who contributed it.
    chatbot.append([
        "函数插件功能?",
        "对整个Latex项目进行纠错,用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # os.system() does not raise when the executable is missing (it only
    # returns a non-zero status), so look the binary up on PATH explicitly.
    if shutil.which('pdflatex') is None:
        report_execption(chatbot, history, a=f"解析项目: {txt}",
                         b=f"尝试执行Latex指令失败。Latex没有安装,或者不在环境变量PATH中。")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    history = []  # clear the history so the model input does not overflow
    if not os.path.exists(txt):
        if txt == "":
            txt = '空空如也的输入栏'
        report_execption(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return
    project_folder = txt

    file_manifest = glob.glob(f'{project_folder}/**/*.tex', recursive=True)
    if not file_manifest:
        report_execption(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.tex文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Reuse an existing proofread result instead of querying the model again.
    if not os.path.exists(project_folder + '/merge_proofread.tex'):
        yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
                                 chatbot, history, system_prompt, language='en', mode='proofread_latex')

    res_pdf_path = 编译Latex差别(main_file_original='merge', main_file_modified='merge_proofread',
                             work_folder_original=project_folder, work_folder_modified=project_folder,
                             work_folder=project_folder)
    return res_pdf_path


def _run_latex_tool(command, cwd=None, stdout=None):
    """Run one external LaTeX tool without a shell.

    Args:
        command: Argument list, executable name first.
        cwd: Directory to run in; ``None`` keeps the current directory.
        stdout: Optional binary file object receiving the tool's stdout.

    Returns:
        The exit status, or ``None`` when the executable is not on PATH
        (best effort: a missing or failing tool does not raise here).
    """
    import shutil, subprocess
    if shutil.which(command[0]) is None:
        print(f'command not found: {command[0]}')
        return None
    return subprocess.run(command, cwd=cwd, stdout=stdout).returncode


def 编译Latex差别(main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder):
    """Compile both versions, generate ``diff.tex`` with latexdiff and compile it.

    Commands are passed as argument lists with ``cwd=`` instead of shell
    strings plus ``os.chdir``, so paths containing spaces or shell
    metacharacters are safe and the process-wide working directory is never
    changed.

    Args:
        main_file_original: Main tex file name of the original, without ``.tex``.
        main_file_modified: Main tex file name of the modified version, without ``.tex``.
        work_folder_original: Folder containing the original main file.
        work_folder_modified: Folder containing the modified main file.
        work_folder: Folder where ``diff.tex`` is written and compiled.

    Returns:
        str: ``f'{work_folder}/diff.pdf'``. The path is returned whether or
        not compilation succeeded.
    """
    # batchmode keeps pdflatex from stopping at an interactive prompt on errors:
    # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
    pdflatex = ['pdflatex', '-interaction=batchmode']

    _run_latex_tool(pdflatex + [f'{main_file_original}.tex'], cwd=work_folder_original)
    _run_latex_tool(pdflatex + [f'{main_file_modified}.tex'], cwd=work_folder_modified)
    _run_latex_tool(['bibtex', f'{main_file_original}.aux'], cwd=work_folder_original)
    _run_latex_tool(['bibtex', f'{main_file_modified}.aux'], cwd=work_folder_modified)

    latexdiff = [
        'latexdiff', '--encoding=utf8', '--append-safecmd=subfile',
        f'{work_folder_original}/{main_file_original}.tex',
        f'{work_folder_modified}/{main_file_modified}.tex',
        '--flatten',
    ]
    diff_tex = f'{work_folder}/diff.tex'
    print(' '.join(latexdiff) + f' > {diff_tex}')
    # latexdiff writes the merged document to stdout; capture it into diff.tex.
    with open(diff_tex, 'wb') as diff_file:
        _run_latex_tool(latexdiff, stdout=diff_file)

    # pdflatex -> bibtex -> pdflatex x2 so citations and references resolve.
    _run_latex_tool(pdflatex + ['diff.tex'], cwd=work_folder)
    _run_latex_tool(['bibtex', 'diff.aux'], cwd=work_folder)
    _run_latex_tool(pdflatex + ['diff.tex'], cwd=work_folder)
    _run_latex_tool(pdflatex + ['diff.tex'], cwd=work_folder)

    return f'{work_folder}/diff.pdf'


def Latex预处理(pfg, project_folder):
    """Create 'original' and 'modified' working copies of a LaTeX project.

    The project is copied twice (skipping ``.git``); the processed file
    contents held by ``pfg`` are then written over the matching files of the
    'modified' copy.

    Args:
        pfg: Object exposing ``file_paths`` and ``file_result`` (parallel lists).
        project_folder: Root folder of the source project.

    Returns:
        tuple: ``(main_file_original, main_file_modified, work_folder_original,
        work_folder_modified, work_folder)`` where the main file is the first
        top-level ``.tex`` file containing ``\\documentclass``.

    Raises:
        RuntimeError: If no top-level ``.tex`` file contains ``\\documentclass``.
    """
    import glob, os, shutil

    work_folder = 'private_upload/latex_workshop_temp'
    work_folder_original = 'private_upload/latex_workshop_temp/original'
    work_folder_modified = 'private_upload/latex_workshop_temp/modified'

    # Start from a clean workspace; a missing folder is not an error.
    shutil.rmtree(work_folder, ignore_errors=True)
    skip_git = shutil.ignore_patterns('.git')
    shutil.copytree(project_folder, work_folder_original, ignore=skip_git)
    shutil.copytree(project_folder, work_folder_modified, ignore=skip_git)

    for path, result in zip(pfg.file_paths, pfg.file_result):
        relative = os.path.relpath(path, start=project_folder)
        with open(os.path.join(work_folder_modified, relative), 'w', encoding='utf-8') as f:
            f.write(result)

    for main_file_original in glob.glob('private_upload/latex_workshop_temp/original/*.tex'):
        with open(main_file_original, 'r', encoding='utf8', errors='replace') as f:
            file_content = f.read()
        if r'\documentclass' not in file_content:
            continue
        # The modified main file sits at the same relative location inside
        # the 'modified' copy (previously this returned the relative path of
        # the 'modified' directory itself, and the computed path was unused).
        relative = os.path.relpath(main_file_original, start=work_folder_original)
        main_file_modified = os.path.join(work_folder_modified, relative)
        return main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder
    raise RuntimeError('无法找到一个主Tex文件, 本程序寻找主Tex文件的方法是查找文件中的documentclass关键字。')
# Matches \input{...}; group 1 is the referenced file name.
_INPUT_COMMAND = re.compile(r"\\input\{(.*?)\}", re.M)


def merge_tex_files(project_foler, main_file):
    """Recursively inline every ``\\input{...}`` of a LaTeX source string.

    Args:
        project_foler: Project root; ``\\input`` targets are resolved relative
            to it. (The parameter name is kept as-is for compatibility.)
        main_file: The *content* of the tex file to expand (not a path).

    Returns:
        str: The content with each ``\\input{...}`` replaced by the
        (recursively expanded) content of the referenced file.

    Raises:
        FileNotFoundError: If a referenced file does not exist.
    """
    def inline(match):
        name = match.group(1).strip()
        path = os.path.join(project_foler, name)
        # LaTeX allows \input{chapter} to mean chapter.tex.
        if not name.endswith('.tex') and os.path.exists(path + '.tex'):
            path += '.tex'
        with open(path, 'r', encoding='utf-8', errors='replace') as fx:
            return merge_tex_files(project_foler, fx.read())

    # A callable replacement is inserted verbatim (no backslash processing).
    return _INPUT_COMMAND.sub(inline, main_file)


class LinkTable():
    """Singly linked list node holding one fragment of the LaTeX source."""

    def __init__(self, string, preserve=True) -> None:
        self.string = string        # text of this fragment
        self.preserve = preserve    # True: copied back verbatim by merge_result
        self.next = None            # following node, or None at the tail


class LatexPaperSplit():
    """Split a LaTeX document into verbatim parts and rewritable fragments.

    ``split`` builds a linked list of ``LinkTable`` nodes and returns the
    text of the non-preserved ones; ``merge_result`` stitches replacement
    texts for those fragments back between the preserved parts.
    """

    # Fragments shorter than this (ignoring surrounding newlines) are kept
    # verbatim instead of being handed out for rewriting.
    MIN_FRAGMENT_LENGTH = 50

    def __init__(self) -> None:
        self.root = None
        self.sp = []

    def merge_result(self, arr):
        """Rebuild the document from the replacement fragments in ``arr``.

        Args:
            arr: Replacement texts, one per fragment returned by ``split``,
                in the same order.

        Returns:
            str: Preserved parts and replacement fragments concatenated in
            document order. Unescaped ``%`` characters in the replacements
            are escaped as ``\\%`` so they cannot start a LaTeX comment and
            swallow the rest of the line. Returns ``""`` if ``split`` has
            not been called.

        Raises:
            IndexError: If ``arr`` has fewer items than there are fragments.
        """
        def escape_percent(s):
            return re.sub(r'(?<!\\)%', r'\\%', s)

        pieces = []
        node = self.root
        p = 0
        while node is not None:
            if node.preserve:
                pieces.append(node.string)
            else:
                pieces.append(escape_percent(arr[p]))
                p += 1
            node = node.next
        return "".join(pieces)

    def split(self, txt):
        """Split ``txt`` and return the fragments that may be rewritten.

        Text matched by the structural patterns below (preamble up to
        ``\\maketitle``, section headings, figure/table environments,
        ``\\item``, ``\\begin{...}``/``\\end{...}``) is marked as preserved,
        as is any remaining fragment shorter than ``MIN_FRAGMENT_LENGTH``.

        Args:
            txt: Full LaTeX source.

        Returns:
            list[str]: The non-preserved fragments in document order (also
            stored in ``self.sp``; the list head is stored in ``self.root``).
        """
        root = LinkTable(txt, False)

        def split_worker(root, pattern, flags=0):
            # Cut every match of `pattern` out of the non-preserved nodes and
            # insert it as its own preserved node.
            regex = re.compile(pattern, flags)
            lt = root
            while lt is not None:
                if not lt.preserve:
                    while True:
                        res = regex.search(lt.string)
                        if not res:
                            break
                        start, end = res.span()
                        before = res.string[:start]
                        this = res.group(0)
                        after = res.string[end:]
                        if after.startswith('\n'):
                            # Keep the trailing newline with the preserved part.
                            this = this + '\n'
                            after = after[1:]
                        # lt -> mid (preserved match) -> aft (remainder) -> old next
                        mid = LinkTable(this, True)
                        aft = LinkTable(after, False)
                        aft.next = lt.next
                        mid.next = aft
                        lt.string = before
                        lt.next = mid
                        # Continue searching in the remainder.
                        lt = aft
                lt = lt.next

        # `root` is the head of the linked list.
        print('正在分解Latex源文件')
        split_worker(root, r"(.*?)\\maketitle", re.DOTALL)
        split_worker(root, r"\\section\{(.*?)\}")
        split_worker(root, r"\\subsection\{(.*?)\}")
        split_worker(root, r"\\subsubsection\{(.*?)\}")
        split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
        split_worker(root, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
        split_worker(root, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
        split_worker(root, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
        split_worker(root, r"\\item ")
        split_worker(root, r"\\begin\{(.*?)\}")
        split_worker(root, r"\\end\{(.*?)\}")

        fragments = []
        node = root
        while node is not None:
            # Empty or very short leftovers are not worth rewriting.
            if len(node.string.strip('\n')) < self.MIN_FRAGMENT_LENGTH:
                node.preserve = True
            if not node.preserve:
                fragments.append(node.string)
            node = node.next

        self.root = root
        self.sp = fragments
        return self.sp


class LatexPaperFileGroup():
    """Hold a group of texts and split them to fit a token limit.

    ``file_paths`` entries are only used as labels and as name stems, so
    they may be strings or (as for in-memory fragments) integer indices.
    """

    def __init__(self):
        self.file_paths = []
        self.file_contents = []
        self.sp_file_contents = []
        self.sp_file_index = []
        self.sp_file_tag = []
        self.sp_file_result = []    # filled by the caller with one result per split part
        self.file_result = []       # filled by merge_result()

        # Token counter based on the gpt-3.5-turbo tokenizer.
        from request_llm.bridge_all import model_info
        enc = model_info["gpt-3.5-turbo"]['tokenizer']
        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
        self.get_token_num = get_token_num

    def run_file_split(self, max_token_limit=1900):
        """Split every content that reaches ``max_token_limit`` tokens.

        Fills ``sp_file_contents`` (parts), ``sp_file_index`` (index of the
        source item of each part) and ``sp_file_tag`` (label of each part).
        """
        for index, file_content in enumerate(self.file_contents):
            if self.get_token_num(file_content) < max_token_limit:
                self.sp_file_contents.append(file_content)
                self.sp_file_index.append(index)
                self.sp_file_tag.append(self.file_paths[index])
            else:
                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
                for j, segment in enumerate(segments):
                    self.sp_file_contents.append(segment)
                    self.sp_file_index.append(index)
                    # f-string so non-string labels (e.g. int indices) work too.
                    self.sp_file_tag.append(f"{self.file_paths[index]}.part-{j}.tex")
        print('Segmentation: done')

    def merge_result(self):
        """Concatenate ``sp_file_result`` parts back into one result per source item."""
        self.file_result = ["" for _ in range(len(self.file_paths))]
        for r, k in zip(self.sp_file_result, self.sp_file_index):
            self.file_result[k] += r

    def write_result(self):
        """Write each result to ``<path>.polish.tex`` and return the written paths."""
        manifest = []
        for path, res in zip(self.file_paths, self.file_result):
            target = f"{path}.polish.tex"
            with open(target, 'w', encoding='utf8') as f:
                manifest.append(target)
                f.write(res)
        return manifest

    def zip_result(self):
        """Zip the folder of the first file into ``./gpt_log/<timestamp>-polished.zip``."""
        import os, time
        folder = os.path.dirname(self.file_paths[0])
        t = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        zip_folder(folder, './gpt_log/', f'{t}-polished.zip')
def objdump(obj, file='objdump.tmp'):
    """Debug helper: pickle ``obj`` into ``file``, overwriting it.

    Args:
        obj: Any picklable object.
        file: Destination path; defaults to ``objdump.tmp`` in the current
            working directory.
    """
    import pickle
    with open(file, 'wb') as f:
        pickle.dump(obj, f)
    return


def objload(file='objdump.tmp'):
    """Debug helper: load the object previously stored with ``objdump``.

    Args:
        file: Source path; defaults to ``objdump.tmp`` in the current
            working directory.

    Returns:
        The unpickled object, or ``None`` when ``file`` does not exist.
    """
    import pickle, os
    if not os.path.exists(file):
        return None
    # SECURITY: unpickling can execute arbitrary code. Only load dump files
    # written by this process/user; never point this at untrusted data.
    with open(file, 'rb') as f:
        return pickle.load(f)