From 7f5be93c1db085e99d82772527c805a44e9c2e95 Mon Sep 17 00:00:00 2001
From: qingxu fu <505030475@qq.com>
Date: Mon, 5 Jun 2023 22:57:39 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E4=B8=80=E4=BA=9B=E6=AD=A3?=
 =?UTF-8?q?=E5=88=99=E5=8C=B9=E9=85=8Dbug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/Latex输出PDF结果.py     |  14 +-
 crazy_functions/crazy_functions_test.py |  10 +-
 crazy_functions/latex_utils.py          | 274 ++++++++++++------------
 3 files changed, 150 insertions(+), 148 deletions(-)

diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py
index daac763..4cae80a 100644
--- a/crazy_functions/Latex输出PDF结果.py
+++ b/crazy_functions/Latex输出PDF结果.py
@@ -65,8 +65,10 @@ def move_project(project_folder, arxiv_id=None):
         new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder')
     else:
         new_workfolder = f'gpt_log/{gen_time_str()}'
-    try: shutil.rmtree(new_workfolder)
-    except: pass
+    try:
+        shutil.rmtree(new_workfolder)
+    except:
+        pass
     shutil.copytree(src=project_folder, dst=new_workfolder)
     return new_workfolder
 
@@ -134,7 +136,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo
     try:
         import glob, os, time
         os.system(f'pdflatex -version')
-        from .latex_utils import Latex精细分解与转化, 编译Latex差别
+        from .latex_utils import Latex精细分解与转化, 编译Latex
     except Exception as e:
         chatbot.append([ f"解析项目: {txt}",
             f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
@@ -172,7 +174,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo
 
 
     # <-------------- compile PDF ------------->
-    success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread', 
+    success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread', 
                              work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
     
 
@@ -204,7 +206,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
     try:
         import glob, os, time
         os.system(f'pdflatex -version')
-        from .latex_utils import Latex精细分解与转化, 编译Latex差别
+        from .latex_utils import Latex精细分解与转化, 编译Latex
     except Exception as e:
         chatbot.append([ f"解析项目: {txt}",
             f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
@@ -247,7 +249,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
 
 
     # <-------------- compile PDF ------------->
-    success = yield from 编译Latex差别(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', 
+    success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', 
                              work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
 
     # <-------------- zip PDF ------------->
diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py
index d4e3274..21c6713 100644
--- a/crazy_functions/crazy_functions_test.py
+++ b/crazy_functions/crazy_functions_test.py
@@ -186,12 +186,12 @@ def test_Latex():
     txt = "C:/Users/fuqingxu/Desktop/旧文件/gpt/papery"
     txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-14-57-06"
     txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-15-40-20"
-    txt = r"https://arxiv.org/abs/1902.03185"
     txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-17-14-40"
-    txt = r"https://arxiv.org/abs/2305.18290"
-    txt = r"https://arxiv.org/abs/2305.17608"
-    # txt = r"https://arxiv.org/abs/2306.00324"
-    txt = r"https://arxiv.org/abs/2211.16068"
+    txt = r"https://arxiv.org/abs/1902.03185"
+    # txt = r"https://arxiv.org/abs/2305.18290"
+    # txt = r"https://arxiv.org/abs/2305.17608"
+    # txt = r"https://arxiv.org/abs/2211.16068"   #  ACE
+    # txt = r"C:\Users\fuqingxu\arxiv_cache\2211.16068\workfolder"   #  ACE
     
     for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
         cli_printer.print(cb)   #  print(cb)
diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py
index 9b8d873..4941354 100644
--- a/crazy_functions/latex_utils.py
+++ b/crazy_functions/latex_utils.py
@@ -2,8 +2,76 @@ from toolbox import update_ui, update_ui_lastest_msg    # 刷新Gradio前端界
 from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone
 import os, shutil
 import re
+import numpy as np
 pj = os.path.join
 
+"""
+========================================================================
+第一部分
+Latex 文件切分到一个链表中
+========================================================================
+"""
+PRESERVE = 0   # mask value: character is kept untouched (its node gets preserve=True)
+TRANSFORM = 1  # mask value: character belongs to text that may be handed over for rewriting
+
+def split_worker(text, mask, pattern, flags=0):
+    # Flag every region matched by `pattern` as PRESERVE; `mask` is updated in place.
+    for hit in re.finditer(pattern, text, flags):
+        mask[hit.start():hit.end()] = PRESERVE
+    return text, mask
+
+def split_worker_reverse_caption(text, mask, pattern, flags=0):
+    # Flip the first capture group of every match back to TRANSFORM (in place on `mask`).
+    for hit in re.finditer(pattern, text, flags):
+        mask[hit.start(1):hit.end(1)] = TRANSFORM
+    return text, mask
+
+def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=25):  # NOTE(review): limit_n_lines is never read in this body; the line threshold below is hard-coded
+    pattern_compile = re.compile(pattern, flags)
+    def search_with_line_limit(text, mask):  # recursive helper: works on the current (sub)string and the mask slice aligned with it
+        for res in pattern_compile.finditer(text):
+            cmd = res.group(1)  # first capture group (noted by the author as "begin{what}")
+            this = res.group(2) # second capture group: content between begin and end
+            this_mask = mask[res.regs[2][0]:res.regs[2][1]]  # slice of the mask covering `this`
+            white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', 
+                          'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate']
+            if (cmd in white_list) or this.count('\n') >= 42: # white-listed name, or 42+ newlines inside: descend into the content instead of freezing it
+                this, this_mask = search_with_line_limit(this, this_mask)
+                mask[res.regs[2][0]:res.regs[2][1]] = this_mask
+            else:
+                mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE  # otherwise the whole match, delimiters included, is marked PRESERVE
+        return text, mask
+    return search_with_line_limit(text, mask) 
+
+class LinkedListNode:
+    """
+    One cell of the singly linked list of text fragments.
+    """
+    def __init__(self, string, preserve=True) -> None:
+        # `preserve` flags whether the fragment is kept as-is; both positions start at 0.
+        self.string, self.preserve = string, preserve
+        self.next = None
+        self.begin_line = 0
+        self.begin_char = 0
+
+def convert_to_linklist(text, mask):
+    # Return the head of a linked list with one node per maximal run of equal mask value (mask assumed to hold only PRESERVE/TRANSFORM); runs are sliced out of `text`, avoiding quadratic per-character `+=`.
+    root = LinkedListNode("", preserve=True)  # empty head node; a leading PRESERVE run is stored in it
+    current_node, start, n = root, 0, min(len(text), len(mask))  # min() keeps the old zip() truncation
+    for end in range(1, n + 1):
+        if end < n and mask[end] == mask[start]:
+            continue  # still inside the current run
+        preserve = bool(mask[start] == PRESERVE)
+        if preserve and current_node is root: root.string = text[start:end]
+        else: current_node.next = current_node = LinkedListNode(text[start:end], preserve=preserve)  # link the new node, then advance onto it
+        start = end
+    return root
+"""
+========================================================================
+Latex 文件融合
+========================================================================
+"""
+
 def 寻找Latex主文件(file_manifest, mode):
     """
     在多Tex文档中，寻找主文件，必须包含documentclass，返回找到的第一个。
@@ -71,19 +139,15 @@ def merge_tex_files(project_foler, main_file, mode):
     return main_file
 
 
-class LinkedListNode():
-    """
-    链表单元
-    """
-    def __init__(self, string, preserve=True) -> None:
-        self.string = string
-        self.preserve = preserve
-        self.next = None
-
 
+"""
+========================================================================
+后处理
+========================================================================
+"""
 def mod_inbraket(match):
     """
-    为啥chatgpt会把cite里面的逗号换成中文逗号呀 艹
+    为啥chatgpt会把cite里面的逗号换成中文逗号呀 
     """
     # get the matched string
     cmd = match.group(1)
@@ -98,19 +162,24 @@ def fix_content(final_tex, node_string):
     """
     Fix common GPT errors to increase success rate
     """
-    final_tex = final_tex.replace('%', r'\%')
-    final_tex = final_tex.replace(r'\%', r'\\%')
+    final_tex = re.sub(r"(?<!\\)%", "\\%", final_tex)
     final_tex = re.sub(r"\\([a-z]{2,10})\ \{", r"\\\1{", string=final_tex)
     final_tex = re.sub(r"\\\ ([a-z]{2,10})\{", r"\\\1{", string=final_tex)
     final_tex = re.sub(r"\\([a-z]{2,10})\{([^\}]*?)\}", mod_inbraket, string=final_tex)
+
+    if node_string.count('\\begin') != final_tex.count('\\begin'):
+        final_tex = node_string # 出问题了，还原原文
+    if node_string.count('\_') > 0 and node_string.count('\_') > final_tex.count('\_'):
+        # walk and replace any _ without \
+        final_tex = re.sub(r"(?<!\\)_", "\\_", final_tex)
     if node_string.count('{') != node_string.count('}'):
         if final_tex.count('{') != node_string.count('{'):
             final_tex = node_string # 出问题了，还原原文
         if final_tex.count('}') != node_string.count('}'):
             final_tex = node_string # 出问题了，还原原文
-
     return final_tex
 
+
 class LatexPaperSplit():
     """
     将Latex文档分解到一个链表中，每个链表节点用preserve的标志位提示它是否应当被GPT处理
@@ -155,131 +224,62 @@ class LatexPaperSplit():
         """
         将Latex文档分解到一个链表中，每个链表节点用preserve的标志位提示它是否应当被GPT处理
         """
-        root = LinkedListNode(txt, False)
-        def split_worker(root, pattern, flags=0):
-            lt = root
-            cnt = 0
-            pattern_compile = re.compile(pattern, flags)
-            while True:
-                if not lt.preserve:
-                    while True:
-                        res = pattern_compile.search(lt.string)
-                        if not res: break
-                        before = res.string[:res.span()[0]]
-                        this = res.group(0)
-                        after = res.string[res.span()[1]:]
-                        # ======
-                        lt.string = before
-                        tmp  = lt.next
-                        # ======
-                        mid = LinkedListNode(this, True)
-                        lt.next = mid
-                        # ======
-                        aft = LinkedListNode(after, False)
-                        mid.next = aft
-                        aft.next = tmp
-                        # ======
-                        lt = aft
-                lt = lt.next
-                cnt += 1
-                # print(cnt)
-                if lt is None: break
+        text = txt
+        mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM
 
-        def split_worker_begin_end(root, pattern, flags=0, limit_n_lines=25):
-            lt = root
-            cnt = 0
-            pattern_compile = re.compile(pattern, flags)
-            while True:
-                if not lt.preserve:
-                    while True:
-                        target_string = lt.string
-
-                        def search_with_line_limit(target_string):
-                            for res in pattern_compile.finditer(target_string):
-                                cmd = res.group(1) # begin{what}
-                                this = res.group(2) # content between begin and end
-                                white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', 'em', 'emph', 'textit', 'textbf']
-                                if cmd in white_list or this.count('\n') > 25:
-                                    sub_res = search_with_line_limit(this)
-                                    if not sub_res: continue
-                                    else: return sub_res
-                                else:
-                                    return res.group(0)
-                            return False
-                        # ======
-                        # search for first encounter of \begin \end pair with less than 25 lines in the middle
-                        ps = search_with_line_limit(target_string) 
-                        if not ps: break
-                        res = re.search(re.escape(ps), target_string, flags)
-                        if not res: assert False
-                        before = res.string[:res.span()[0]]
-                        this = res.group(0)
-                        after = res.string[res.span()[1]:]
-                        # ======
-                        lt.string = before
-                        tmp  = lt.next
-                        # ======
-                        mid = LinkedListNode(this, True)
-                        lt.next = mid
-                        # ======
-                        aft = LinkedListNode(after, False)
-                        mid.next = aft
-                        aft.next = tmp
-                        # ======
-                        lt = aft
-                lt = lt.next
-                cnt += 1
-                # print(cnt)
-                if lt is None: break
-
-
-        # root 是链表的头
-        print('正在分解Latex源文件，构建链表结构')
+        # 吸收title与作者以上的部分
+        text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL)
         # 删除iffalse注释
-        split_worker(root, r"\\iffalse(.*?)\\fi", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
         # 吸收在25行以内的begin-end组合
-        split_worker_begin_end(root, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25)
+        text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25)
         # 吸收匿名公式
-        split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL)
         # 吸收其他杂项
-        split_worker(root, r"(.*?)\\maketitle", re.DOTALL)
-        split_worker(root, r"\\section\{(.*?)\}")
-        split_worker(root, r"\\section\*\{(.*?)\}")
-        split_worker(root, r"\\subsection\{(.*?)\}")
-        split_worker(root, r"\\subsubsection\{(.*?)\}")
-        split_worker(root, r"\\bibliography\{(.*?)\}")
-        split_worker(root, r"\\bibliographystyle\{(.*?)\}")
-        split_worker(root, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
-        split_worker(root, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
-        split_worker(root, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
-        split_worker(root, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL)
-        split_worker(root, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL)
-        split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
-        split_worker(root, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
-        split_worker(root, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL)
-        split_worker(root, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL)
-        split_worker(root, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
-        split_worker(root, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
-        split_worker(root, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL)
-        split_worker(root, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL)
-        split_worker(root, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL)
-        split_worker(root, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
-        split_worker(root, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
-        split_worker(root, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
-        split_worker(root, r"\\item ")
-        split_worker(root, r"\\label\{(.*?)\}")
-        split_worker(root, r"\\begin\{(.*?)\}")
-        split_worker(root, r"\\vspace\{(.*?)\}")
-        split_worker(root, r"\\hspace\{(.*?)\}")
-        split_worker(root, r"\\end\{(.*?)\}")
-
-        node = root
-        while True:
-            if len(node.string.strip('\n').strip(''))==0: node.preserve = True
-            if len(node.string.strip('\n').strip(''))<50: node.preserve = True
-            node = node.next
-            if node is None: break
-
+        text, mask = split_worker(text, mask, r"\\section\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
+        text, mask = split_worker(text, mask, r"\\item ")
+        text, mask = split_worker(text, mask, r"\\label\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}")
+        text, mask = split_worker(text, mask, r"\\end\{(.*?)\}")
+        # text, mask = split_worker_reverse_caption(text, mask, r"\\caption\{(.*?)\}", re.DOTALL)
+        root = convert_to_linklist(text, mask)
+        # 将分解结果返回 res_to_t
+        with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
+            res_to_t = []
+            node = root
+            while True:
+                show_html = node.string.replace('\n','<br/>')
+                if not node.preserve:
+                    res_to_t.append(node.string)
+                    f.write(f'<p style="color:black;">#{show_html}#</p>')
+                else:
+                    f.write(f'<p style="color:red;">{show_html}</p>')
+                node = node.next
+                if node is None: break
         # 修复括号
         node = root
         while True:
@@ -295,7 +295,7 @@ class LatexPaperSplit():
                         str_stack.append('{')
                     elif c == '}':
                         if len(str_stack) == 1:
-                            print('stack kill')
+                            print('stack fix')
                             return i
                         str_stack.pop(-1)
                     else:
@@ -322,7 +322,7 @@ class LatexPaperSplit():
         node = root
         while True:
             if len(node.string.strip('\n').strip(''))==0: node.preserve = True
-            if len(node.string.strip('\n').strip(''))<50: node.preserve = True
+            if len(node.string.strip('\n').strip(''))<42: node.preserve = True
             node = node.next
             if node is None: break
 
@@ -418,7 +418,7 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
     maintex = 寻找Latex主文件(file_manifest, mode)
     chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果：该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。'))
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    time.sleep(5)
+    time.sleep(3)
 
     #  <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> 
     main_tex_basename = os.path.basename(maintex)
@@ -529,7 +529,7 @@ def compile_latex_with_timeout(command, timeout=60):
         return False
     return True
 
-def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder):
+def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder):
     import os, time
     current_dir = os.getcwd()
     n_fix = 1