From 7f5be93c1db085e99d82772527c805a44e9c2e95 Mon Sep 17 00:00:00 2001 From: qingxu fu <505030475@qq.com> Date: Mon, 5 Jun 2023 22:57:39 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E4=B8=80=E4=BA=9B=E6=AD=A3?= =?UTF-8?q?=E5=88=99=E5=8C=B9=E9=85=8Dbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/Latex输出PDF结果.py | 14 +- crazy_functions/crazy_functions_test.py | 10 +- crazy_functions/latex_utils.py | 274 ++++++++++++------------ 3 files changed, 150 insertions(+), 148 deletions(-) diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py index daac763..4cae80a 100644 --- a/crazy_functions/Latex输出PDF结果.py +++ b/crazy_functions/Latex输出PDF结果.py @@ -65,8 +65,10 @@ def move_project(project_folder, arxiv_id=None): new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder') else: new_workfolder = f'gpt_log/{gen_time_str()}' - try: shutil.rmtree(new_workfolder) - except: pass + try: + shutil.rmtree(new_workfolder) + except: + pass shutil.copytree(src=project_folder, dst=new_workfolder) return new_workfolder @@ -134,7 +136,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo try: import glob, os, time os.system(f'pdflatex -version') - from .latex_utils import Latex精细分解与转化, 编译Latex差别 + from .latex_utils import Latex精细分解与转化, 编译Latex except Exception as e: chatbot.append([ f"解析项目: {txt}", f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) @@ -172,7 +174,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo # <-------------- compile PDF -------------> - success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread', + success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread', work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder) @@ -204,7 +206,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, try: import glob, os, time os.system(f'pdflatex -version') - from .latex_utils import Latex精细分解与转化, 编译Latex差别 + from .latex_utils import Latex精细分解与转化, 编译Latex except Exception as e: chatbot.append([ f"解析项目: {txt}", f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) @@ -247,7 +249,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- compile PDF -------------> - success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', + success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder) # <-------------- zip PDF -------------> diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py index d4e3274..21c6713 100644 --- a/crazy_functions/crazy_functions_test.py +++ b/crazy_functions/crazy_functions_test.py @@ -186,12 +186,12 @@ def test_Latex(): txt = "C:/Users/fuqingxu/Desktop/旧文件/gpt/papery" txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-14-57-06" txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-15-40-20" - txt = r"https://arxiv.org/abs/1902.03185" txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-17-14-40" - txt = r"https://arxiv.org/abs/2305.18290" - txt = r"https://arxiv.org/abs/2305.17608" - # txt = r"https://arxiv.org/abs/2306.00324" - txt = r"https://arxiv.org/abs/2211.16068" + txt = r"https://arxiv.org/abs/1902.03185" + # txt = r"https://arxiv.org/abs/2305.18290" + # txt = r"https://arxiv.org/abs/2305.17608" + # txt = r"https://arxiv.org/abs/2211.16068" # ACE + # txt = r"C:\Users\fuqingxu\arxiv_cache\2211.16068\workfolder" # ACE for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): cli_printer.print(cb) # print(cb) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 9b8d873..4941354 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -2,8 +2,76 @@ from toolbox import update_ui, update_ui_lastest_msg # 刷新Gradio前端界 from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone import os, shutil import re +import numpy as np pj = os.path.join +""" +======================================================================== +第一部分 +Latex 文件切分到一个链表中 +======================================================================== +""" +PRESERVE = 0 +TRANSFORM = 1 + +def split_worker(text, mask, pattern, flags=0): + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + mask[res.span()[0]:res.span()[1]] = PRESERVE + return text, mask + +def split_worker_reverse_caption(text, mask, pattern, flags=0): + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + mask[res.regs[1][0]:res.regs[1][1]] = TRANSFORM + return text, mask + +def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=25): + pattern_compile = re.compile(pattern, flags) + def search_with_line_limit(text, mask): + for res in pattern_compile.finditer(text): + cmd = res.group(1) # begin{what} + this = res.group(2) # content between begin and end + this_mask = mask[res.regs[2][0]:res.regs[2][1]] + white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', + 'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate'] + if (cmd in white_list) or this.count('\n') >= 42: # use a magical number 42 + this, this_mask = search_with_line_limit(this, this_mask) + mask[res.regs[2][0]:res.regs[2][1]] = this_mask + else: + mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE + return text, mask + return search_with_line_limit(text, mask) + +class LinkedListNode(): + """ + 链表单元 + """ + def __init__(self, string, preserve=True) -> None: + self.string = string + self.preserve = preserve + self.next = None + self.begin_line = 0 + self.begin_char = 0 + +def convert_to_linklist(text, mask): + root = LinkedListNode("", preserve=True) + current_node = root + for c, m, i in zip(text, mask, range(len(text))): + if (m==PRESERVE and current_node.preserve) \ + or (m==TRANSFORM and not current_node.preserve): + # add + current_node.string += c + else: + current_node.next = LinkedListNode(c, preserve=(m==PRESERVE)) + current_node = current_node.next + return root +""" +======================================================================== +Latex 文件融合 +======================================================================== +""" + def 寻找Latex主文件(file_manifest, mode): """ 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 @@ -71,19 +139,15 @@ def merge_tex_files(project_foler, main_file, mode): return main_file -class LinkedListNode(): - """ - 链表单元 - """ - def __init__(self, string, preserve=True) -> None: - self.string = string - self.preserve = preserve - self.next = None - +""" +======================================================================== +后处理 +======================================================================== +""" def mod_inbraket(match): """ - 为啥chatgpt会把cite里面的逗号换成中文逗号呀 艹 + 为啥chatgpt会把cite里面的逗号换成中文逗号呀 """ # get the matched string cmd = match.group(1) @@ -98,19 +162,24 @@ def fix_content(final_tex, node_string): """ Fix common GPT errors to increase success rate """ - final_tex = final_tex.replace('%', r'\%') - final_tex = final_tex.replace(r'\%', r'\\%') + final_tex = re.sub(r"(? 0 and node_string.count('\_') > final_tex.count('\_'): + # walk and replace any _ without \ + final_tex = re.sub(r"(? 25: - sub_res = search_with_line_limit(this) - if not sub_res: continue - else: return sub_res - else: - return res.group(0) - return False - # ====== - # search for first encounter of \begin \end pair with less than 25 lines in the middle - ps = search_with_line_limit(target_string) - if not ps: break - res = re.search(re.escape(ps), target_string, flags) - if not res: assert False - before = res.string[:res.span()[0]] - this = res.group(0) - after = res.string[res.span()[1]:] - # ====== - lt.string = before - tmp = lt.next - # ====== - mid = LinkedListNode(this, True) - lt.next = mid - # ====== - aft = LinkedListNode(after, False) - mid.next = aft - aft.next = tmp - # ====== - lt = aft - lt = lt.next - cnt += 1 - # print(cnt) - if lt is None: break - - - # root 是链表的头 - print('正在分解Latex源文件,构建链表结构') + # 吸收title与作者以上的部分 + text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL) # 删除iffalse注释 - split_worker(root, r"\\iffalse(.*?)\\fi", re.DOTALL) + text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) # 吸收在25行以内的begin-end组合 - split_worker_begin_end(root, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) + text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) # 吸收匿名公式 - split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL) + text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) # 吸收其他杂项 - split_worker(root, r"(.*?)\\maketitle", re.DOTALL) - split_worker(root, r"\\section\{(.*?)\}") - split_worker(root, r"\\section\*\{(.*?)\}") - split_worker(root, r"\\subsection\{(.*?)\}") - split_worker(root, r"\\subsubsection\{(.*?)\}") - split_worker(root, r"\\bibliography\{(.*?)\}") - split_worker(root, r"\\bibliographystyle\{(.*?)\}") - split_worker(root, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) - split_worker(root, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) - split_worker(root, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) - split_worker(root, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) - split_worker(root, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) - split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) - split_worker(root, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) - split_worker(root, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) - split_worker(root, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) - split_worker(root, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) - split_worker(root, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) - split_worker(root, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) - split_worker(root, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) - split_worker(root, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) - split_worker(root, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) - split_worker(root, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) - split_worker(root, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) - split_worker(root, r"\\item ") - split_worker(root, r"\\label\{(.*?)\}") - split_worker(root, r"\\begin\{(.*?)\}") - split_worker(root, r"\\vspace\{(.*?)\}") - split_worker(root, r"\\hspace\{(.*?)\}") - split_worker(root, r"\\end\{(.*?)\}") - - node = root - while True: - if len(node.string.strip('\n').strip(''))==0: node.preserve = True - if len(node.string.strip('\n').strip(''))<50: node.preserve = True - node = node.next - if node is None: break - + text, mask = split_worker(text, mask, r"\\section\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\item ") + text, mask = split_worker(text, mask, r"\\label\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") + # text, mask = split_worker_reverse_caption(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) + root = convert_to_linklist(text, mask) + # 将分解结果返回 res_to_t + with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: + res_to_t = [] + node = root + while True: + show_html = node.string.replace('\n','
') + if not node.preserve: + res_to_t.append(node.string) + f.write(f'

#{show_html}#

') + else: + f.write(f'

{show_html}

') + node = node.next + if node is None: break # 修复括号 node = root while True: @@ -295,7 +295,7 @@ class LatexPaperSplit(): str_stack.append('{') elif c == '}': if len(str_stack) == 1: - print('stack kill') + print('stack fix') return i str_stack.pop(-1) else: @@ -322,7 +322,7 @@ class LatexPaperSplit(): node = root while True: if len(node.string.strip('\n').strip(''))==0: node.preserve = True - if len(node.string.strip('\n').strip(''))<50: node.preserve = True + if len(node.string.strip('\n').strip(''))<42: node.preserve = True node = node.next if node is None: break @@ -418,7 +418,7 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin maintex = 寻找Latex主文件(file_manifest, mode) chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - time.sleep(5) + time.sleep(3) # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> main_tex_basename = os.path.basename(maintex) @@ -529,7 +529,7 @@ def compile_latex_with_timeout(command, timeout=60): return False return True -def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder): +def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder): import os, time current_dir = os.getcwd() n_fix = 1