diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 3fbbf9b..e3d5113 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -5,6 +5,10 @@ import re pj = os.path.join def 寻找Latex主文件(file_manifest, mode): + """ + 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 + P.S. 但愿没人把latex模板放在里面传进来 + """ for texf in file_manifest: if os.path.basename(texf).startswith('merge'): continue @@ -17,6 +21,9 @@ def 寻找Latex主文件(file_manifest, mode): raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') def merge_tex_files_(project_foler, main_file, mode): + """ + 递归地把多Tex工程整合为一个Tex文档 + """ for s in reversed([q for q in re.finditer(r"\\input\{(.*?)\}", main_file, re.M)]): f = s.group(1) fp = os.path.join(project_foler, f) @@ -33,38 +40,56 @@ def merge_tex_files_(project_foler, main_file, mode): return main_file def merge_tex_files(project_foler, main_file, mode): + """ + 递归地把多Tex工程整合为一个Tex文档(递归外层) + P.S. 顺便把CTEX塞进去以支持中文 + P.S. 顺便把Latex的注释去除 + """ main_file = merge_tex_files_(project_foler, main_file, mode) - if mode == 'translate_zh': pattern = re.compile(r'\\documentclass.*\n') match = pattern.search(main_file) position = match.end() main_file = main_file[:position] + '\\usepackage{CTEX}\n\\usepackage{url}\n' + main_file[position:] - + new_file_remove_comment_lines = [] + for l in main_file.splitlines(): + # 删除整行的空注释 + if l.startswith("%") or (l.startswith(" ") and l.lstrip().startswith("%")): + pass + else: + new_file_remove_comment_lines.append(l) + main_file = '\n'.join(new_file_remove_comment_lines) + main_file = re.sub(r'(? None: self.string = string self.preserve = preserve self.next = None + def mod_inbraket(match): + """ + 为啥chatgpt会把cite里面的逗号换成中文逗号呀 艹 + """ # get the matched string cmd = match.group(1) str_to_modify = match.group(2) - # modify the matched string str_to_modify = str_to_modify.replace(':', ':') # 前面是中文冒号,后面是英文冒号 str_to_modify = str_to_modify.replace(',', ',') # 前面是中文逗号,后面是英文逗号 # str_to_modify = 'BOOM' - # return the modified string as the replacement return "\\" + cmd + "{" + str_to_modify + "}" def fix_content(final_tex, node_string): """ - fix common GPT errors to increase success rate + Fix common GPT errors to increase success rate """ final_tex = final_tex.replace('%', r'\%') final_tex = final_tex.replace(r'\%', r'\\%') @@ -74,10 +99,19 @@ def fix_content(final_tex, node_string): return final_tex class LatexPaperSplit(): + """ + 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理 + """ def __init__(self) -> None: + """ + root是链表的根节点 + """ self.root = None def merge_result(self, arr, mode, msg): + """ + 将GPT处理后的结果融合 + """ result_string = "" node = self.root p = 0 @@ -105,8 +139,10 @@ class LatexPaperSplit(): return result_string def split(self, txt): - # def replace_with_hash() - root = LinkTable(txt, False) + """ + 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理 + """ + root = LinkedListNode(txt, False) def split_worker(root, pattern, flags=0): lt = root cnt = 0 @@ -131,10 +167,10 @@ class LatexPaperSplit(): lt.string = before tmp = lt.next # ====== - mid = LinkTable(this, True) + mid = LinkedListNode(this, True) lt.next = mid # ====== - aft = LinkTable(after, False) + aft = LinkedListNode(after, False) mid.next = aft aft.next = tmp # ====== @@ -152,6 +188,8 @@ class LatexPaperSplit(): split_worker(root, r"\\subsubsection\{(.*?)\}") split_worker(root, r"\\bibliography\{(.*?)\}") split_worker(root, r"\\bibliographystyle\{(.*?)\}") + split_worker(root, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) + split_worker(root, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) split_worker(root, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) split_worker(root, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) split_worker(root, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) @@ -178,13 +216,17 @@ class LatexPaperSplit(): node = node.next if node is None: break - with open('debug_log', 'w', encoding='utf8') as f: + # 将分解结果返回 res_to_t + with open('debug_log.html', 'w', encoding='utf8') as f: res_to_t = [] node = root while True: + show_html = node.string.replace('\n','
') if not node.preserve: res_to_t.append(node.string) - f.write(node.string) + f.write(f'

{show_html}

') + else: + f.write(f'

{show_html}

') node = node.next if node is None: break @@ -260,7 +302,6 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin with open(maintex, 'r', encoding='utf-8', errors='replace') as f: content = f.read() merged_content = merge_tex_files(project_folder, content, mode) - merged_content = re.sub(r'(?