diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py index 810d802..e79cf82 100644 --- a/crazy_functions/Latex输出PDF结果.py +++ b/crazy_functions/Latex输出PDF结果.py @@ -157,7 +157,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo try: import glob, os, time, subprocess subprocess.Popen(['pdflatex', '-version']) - from .latex_utils import Latex精细分解与转化, 编译Latex + from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex except Exception as e: chatbot.append([ f"解析项目: {txt}", f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) @@ -234,7 +234,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, try: import glob, os, time, subprocess subprocess.Popen(['pdflatex', '-version']) - from .latex_utils import Latex精细分解与转化, 编译Latex + from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex except Exception as e: chatbot.append([ f"解析项目: {txt}", f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py index 8b6b540..94f4dd0 100644 --- a/crazy_functions/crazy_functions_test.py +++ b/crazy_functions/crazy_functions_test.py @@ -195,9 +195,10 @@ def test_Latex(): # txt = r"https://arxiv.org/abs/2303.08774" # txt = r"https://arxiv.org/abs/2303.12712" # txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder" - txt = r"2306.17157" # 这个paper有个input命令文件名大小写错误! - - + # txt = r"2306.17157" # 这个paper有个input命令文件名大小写错误! + # txt = "https://arxiv.org/abs/2205.14135" + # txt = r"C:\Users\fuqingxu\arxiv_cache\2205.14135\workfolder" + txt = r"C:\Users\fuqingxu\arxiv_cache\2205.14135\workfolder" for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): cli_printer.print(cb) # print(cb) @@ -240,7 +241,7 @@ if __name__ == "__main__": # test_数学动画生成manim() # test_Langchain知识库() # test_Langchain知识库读取() - # test_Latex() - test_chatglm_finetune() + test_Latex() + # test_chatglm_finetune() input("程序完成,回车退出。") print("退出。") \ No newline at end of file diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_fns/latex_actions.py similarity index 58% rename from crazy_functions/latex_utils.py rename to crazy_functions/latex_fns/latex_actions.py index b3340e7..8ca7ca3 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_fns/latex_actions.py @@ -1,320 +1,16 @@ from toolbox import update_ui, update_ui_lastest_msg # 刷新Gradio前端界面 from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone +from .latex_toolbox import PRESERVE, TRANSFORM +from .latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace +from .latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process +from .latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout + import os, shutil import re import numpy as np + pj = os.path.join - -""" -======================================================================== -Part One -Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1) -======================================================================== -""" -PRESERVE = 0 -TRANSFORM = 1 - -def set_forbidden_text(text, mask, pattern, flags=0): - """ - Add a preserve text area in this paper - e.g. 
with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}" - you can mask out (mask = PRESERVE so that text become untouchable for GPT) - everything between "\begin{equation}" and "\end{equation}" - """ - if isinstance(pattern, list): pattern = '|'.join(pattern) - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - mask[res.span()[0]:res.span()[1]] = PRESERVE - return text, mask - -def reverse_forbidden_text(text, mask, pattern, flags=0, forbid_wrapper=True): - """ - Move area out of preserve area (make text editable for GPT) - count the number of the braces so as to catch compelete text area. - e.g. - \begin{abstract} blablablablablabla. \end{abstract} - """ - if isinstance(pattern, list): pattern = '|'.join(pattern) - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - if not forbid_wrapper: - mask[res.span()[0]:res.span()[1]] = TRANSFORM - else: - mask[res.regs[0][0]: res.regs[1][0]] = PRESERVE # '\\begin{abstract}' - mask[res.regs[1][0]: res.regs[1][1]] = TRANSFORM # abstract - mask[res.regs[1][1]: res.regs[0][1]] = PRESERVE # abstract - return text, mask - -def set_forbidden_text_careful_brace(text, mask, pattern, flags=0): - """ - Add a preserve text area in this paper (text become untouchable for GPT). - count the number of the braces so as to catch compelete text area. - e.g. - \caption{blablablablabla\texbf{blablabla}blablabla.} - """ - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - brace_level = -1 - p = begin = end = res.regs[0][0] - for _ in range(1024*16): - if text[p] == '}' and brace_level == 0: break - elif text[p] == '}': brace_level -= 1 - elif text[p] == '{': brace_level += 1 - p += 1 - end = p+1 - mask[begin:end] = PRESERVE - return text, mask - -def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True): - """ - Move area out of preserve area (make text editable for GPT) - count the number of the braces so as to catch compelete text area. - e.g. - \caption{blablablablabla\texbf{blablabla}blablabla.} - """ - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - brace_level = 0 - p = begin = end = res.regs[1][0] - for _ in range(1024*16): - if text[p] == '}' and brace_level == 0: break - elif text[p] == '}': brace_level -= 1 - elif text[p] == '{': brace_level += 1 - p += 1 - end = p - mask[begin:end] = TRANSFORM - if forbid_wrapper: - mask[res.regs[0][0]:begin] = PRESERVE - mask[end:res.regs[0][1]] = PRESERVE - return text, mask - -def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): - """ - Find all \begin{} ... \end{} text block that with less than limit_n_lines lines. 
- Add it to preserve area - """ - pattern_compile = re.compile(pattern, flags) - def search_with_line_limit(text, mask): - for res in pattern_compile.finditer(text): - cmd = res.group(1) # begin{what} - this = res.group(2) # content between begin and end - this_mask = mask[res.regs[2][0]:res.regs[2][1]] - white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', - 'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate'] - if (cmd in white_list) or this.count('\n') >= limit_n_lines: # use a magical number 42 - this, this_mask = search_with_line_limit(this, this_mask) - mask[res.regs[2][0]:res.regs[2][1]] = this_mask - else: - mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE - return text, mask - return search_with_line_limit(text, mask) - -class LinkedListNode(): - """ - Linked List Node - """ - def __init__(self, string, preserve=True) -> None: - self.string = string - self.preserve = preserve - self.next = None - # self.begin_line = 0 - # self.begin_char = 0 - -def convert_to_linklist(text, mask): - root = LinkedListNode("", preserve=True) - current_node = root - for c, m, i in zip(text, mask, range(len(text))): - if (m==PRESERVE and current_node.preserve) \ - or (m==TRANSFORM and not current_node.preserve): - # add - current_node.string += c - else: - current_node.next = LinkedListNode(c, preserve=(m==PRESERVE)) - current_node = current_node.next - return root -""" -======================================================================== -Latex Merge File -======================================================================== -""" - -def 寻找Latex主文件(file_manifest, mode): - """ - 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 - P.S. 但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码) - """ - canidates = [] - for texf in file_manifest: - if os.path.basename(texf).startswith('merge'): - continue - with open(texf, 'r', encoding='utf8', errors='ignore') as f: - file_content = f.read() - if r'\documentclass' in file_content: - canidates.append(texf) - else: - continue - - if len(canidates) == 0: - raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') - elif len(canidates) == 1: - return canidates[0] - else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回 - canidates_score = [] - # 给出一些判定模板文档的词作为扣分项 - unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers'] - expected_words = ['\input', '\ref', '\cite'] - for texf in canidates: - canidates_score.append(0) - with open(texf, 'r', encoding='utf8', errors='ignore') as f: - file_content = f.read() - for uw in unexpected_words: - if uw in file_content: - canidates_score[-1] -= 1 - for uw in expected_words: - if uw in file_content: - canidates_score[-1] += 1 - select = np.argmax(canidates_score) # 取评分最高者返回 - return canidates[select] - -def rm_comments(main_file): - new_file_remove_comment_lines = [] - for l in main_file.splitlines(): - # 删除整行的空注释 - if l.lstrip().startswith("%"): - pass - else: - new_file_remove_comment_lines.append(l) - main_file = '\n'.join(new_file_remove_comment_lines) - # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令 - main_file = re.sub(r'(? 0 and node_string.count('\_') > final_tex.count('\_'): - # walk and replace any _ without \ - final_tex = re.sub(r"(?') if not node.preserve: segment_parts_for_gpt.append(node.string) - f.write(f'
<p style="color:red;">#{show_html}#</p>') + f.write(f'<p style="color:red;">#{node.range}{show_html}#</p>') else: f.write(f'<p style="color:gray;">{show_html}</p>
') node = node.next @@ -448,8 +76,6 @@ def split_subprocess(txt, project_folder, return_dict, opts): return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt return return_dict - - class LatexPaperSplit(): """ break down latex file to a linked list, @@ -464,18 +90,32 @@ class LatexPaperSplit(): # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者) self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" - def merge_result(self, arr, mode, msg): + + def merge_result(self, arr, mode, msg, buggy_lines=[]): """ Merge the result after the GPT process completed """ result_string = "" - p = 0 + node_cnt = 0 + line_cnt = 0 + for node in self.nodes: if node.preserve: + line_cnt += node.string.count('\n') result_string += node.string else: - result_string += fix_content(arr[p], node.string) - p += 1 + translated_txt = fix_content(arr[node_cnt], node.string) + begin_line = line_cnt + end_line = line_cnt + translated_txt.count('\n') + + # reverse translation if any error + if any([begin_line-buggy_line_surgery_n_lines <= b_line <= end_line+buggy_line_surgery_n_lines for b_line in buggy_lines]): + translated_txt = node.string + + result_string += translated_txt + node_cnt += 1 + line_cnt += translated_txt.count('\n') + if mode == 'translate_zh': pattern = re.compile(r'\\begin\{abstract\}.*\n') match = pattern.search(result_string) @@ -490,6 +130,7 @@ class LatexPaperSplit(): result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] return result_string + def split(self, txt, project_folder, opts): """ break down latex file to a linked list, @@ -511,7 +152,6 @@ class LatexPaperSplit(): return self.sp - class LatexPaperFileGroup(): """ use tokenizer to break down text according to max_token_limit @@ -539,7 +179,7 @@ class LatexPaperFileGroup(): self.sp_file_index.append(index) self.sp_file_tag.append(self.file_paths[index]) else: - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + from ..crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit) for j, segment in enumerate(segments): self.sp_file_contents.append(segment) @@ -560,41 +200,14 @@ class LatexPaperFileGroup(): f.write(res) return manifest -def write_html(sp_file_contents, sp_file_result, chatbot, project_folder): - - # write html - try: - import shutil - from .crazy_utils import construct_html - from toolbox import gen_time_str - ch = construct_html() - orig = "" - trans = "" - final = [] - for c,r in zip(sp_file_contents, sp_file_result): - final.append(c) - final.append(r) - for i, k in enumerate(final): - if i%2==0: - orig = k - if i%2==1: - trans = k - ch.add_row(a=orig, b=trans) - create_report_file_name = f"{gen_time_str()}.trans.html" - ch.save_file(create_report_file_name) - shutil.copyfile(pj('./gpt_log/', create_report_file_name), pj(project_folder, create_report_file_name)) - promote_file_to_downloadzone(file=f'./gpt_log/{create_report_file_name}', chatbot=chatbot) - except: - from toolbox import trimmed_format_exc - print('writing html result failed:', trimmed_format_exc()) def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]): import time, os, re - from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency - from .latex_utils import LatexPaperFileGroup, merge_tex_files, LatexPaperSplit, 寻找Latex主文件 + from 
..crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency + from .latex_actions import LatexPaperFileGroup, LatexPaperSplit # <-------- 寻找主tex文件 ----------> - maintex = 寻找Latex主文件(file_manifest, mode) + maintex = find_main_tex_file(file_manifest, mode) chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 time.sleep(3) @@ -668,54 +281,51 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin # <-------- 写出文件 ----------> msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" final_tex = lps.merge_result(pfg.file_result, mode, msg) + objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl')) + with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f: if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex) # <-------- 整理结果, 退出 ----------> - chatbot.append((f"完成了吗?", 'GPT结果已输出, 正在编译PDF')) + chatbot.append((f"完成了吗?", 'GPT结果已输出, 即将编译PDF')) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # <-------- 返回 ----------> return project_folder + f'/merge_{mode}.tex' - -def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified): +def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified, fixed_line=[]): try: with open(log_path, 'r', encoding='utf-8', errors='replace') as f: log = f.read() - with open(file_path, 'r', encoding='utf-8', errors='replace') as f: - file_lines = f.readlines() import re buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log) buggy_lines = [int(l) for l in buggy_lines] buggy_lines = sorted(buggy_lines) - print("removing lines that has errors", buggy_lines) - file_lines.pop(buggy_lines[0]-1) + buggy_line = buggy_lines[0]-1 + print("reversing tex line that has errors", buggy_line) + + # 重组,逆转出错的段落 + if buggy_line in fixed_line: raise RuntimeError + fixed_line.append(buggy_line) + + lps, file_result, mode, msg = objload(file=pj(work_folder_modified,'merge_result.pkl')) + final_tex = lps.merge_result(file_result, mode, msg, buggy_lines=fixed_line) + with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f: - f.writelines(file_lines) + f.write(final_tex) + return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines except: print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") return False, -1, [-1] - -def compile_latex_with_timeout(command, cwd, timeout=60): - import subprocess - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd) - try: - stdout, stderr = process.communicate(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - stdout, stderr = process.communicate() - print("Process timed out!") - return False - return True + def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): import os, time - current_dir = os.getcwd() n_fix = 1 + fixed_line = [] max_try = 32 chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); 
time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 @@ -723,6 +333,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f while True: import os + may_exist_bbl = pj(work_folder_modified, f'merge.bbl') + target_bbl = pj(work_folder_modified, f'{main_file_modified}.bbl') + if os.path.exists(may_exist_bbl) and not os.path.exists(target_bbl): + shutil.copyfile(may_exist_bbl, target_bbl) # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 @@ -756,7 +370,6 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - # <---------- 检查结果 -----------> results_ = "" original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf')) @@ -783,7 +396,7 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f concat_pdf = pj(work_folder_modified, f'comparison.pdf') merge_pdfs(origin_pdf, result_pdf, concat_pdf) promote_file_to_downloadzone(concat_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI - except: + except Exception as e: pass return True # 成功啦 else: @@ -796,6 +409,7 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f tex_name_pure=f'{main_file_modified}', n_fix=n_fix, work_folder_modified=work_folder_modified, + fixed_line=fixed_line ) yield from update_ui_lastest_msg(f'由于最为关键的转化PDF编译失败, 将根据报错信息修正tex源文件并重试, 当前报错的latex代码处于第{buggy_lines}行 ...', chatbot, history) # 刷新Gradio前端界面 if not can_retry: break @@ -803,14 +417,15 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f return False # 失败啦 + def merge_pdfs(pdf1_path, pdf2_path, output_path): import PyPDF2 # Open the first PDF file with open(pdf1_path, 'rb') as pdf1_file: - pdf1_reader = PyPDF2.PdfFileReader(pdf1_file) + pdf1_reader = PyPDF2.PdfReader(pdf1_file) # Open the second PDF file with open(pdf2_path, 'rb') as pdf2_file: - pdf2_reader = PyPDF2.PdfFileReader(pdf2_file) + pdf2_reader = PyPDF2.PdfReader(pdf2_file) # Create a new PDF file to store the merged pages output_writer = PyPDF2.PdfFileWriter() # Determine the number of pages in each PDF file @@ -837,4 +452,33 @@ def merge_pdfs(pdf1_path, pdf2_path, output_path): output_writer.addPage(new_page) # Save the merged PDF file with open(output_path, 'wb') as output_file: - output_writer.write(output_file) \ No newline at end of file + output_writer.write(output_file) + + + +def write_html(sp_file_contents, sp_file_result, chatbot, project_folder): + # write html + try: + import shutil + from ..crazy_utils import construct_html + from toolbox import gen_time_str + ch = construct_html() + orig = "" + trans = "" + final = [] + for c,r in zip(sp_file_contents, sp_file_result): + final.append(c) + final.append(r) + for i, k in enumerate(final): + if i%2==0: + orig = k + if i%2==1: + trans = k + ch.add_row(a=orig, b=trans) + create_report_file_name = f"{gen_time_str()}.trans.html" + ch.save_file(create_report_file_name) + shutil.copyfile(pj('./gpt_log/', create_report_file_name), pj(project_folder, create_report_file_name)) + promote_file_to_downloadzone(file=f'./gpt_log/{create_report_file_name}', chatbot=chatbot) + except: + from toolbox import 
trimmed_format_exc + print('writing html result failed:', trimmed_format_exc()) diff --git a/crazy_functions/latex_fns/latex_toolbox.py b/crazy_functions/latex_fns/latex_toolbox.py new file mode 100644 index 0000000..0825044 --- /dev/null +++ b/crazy_functions/latex_fns/latex_toolbox.py @@ -0,0 +1,417 @@ +import os, shutil +import re +import numpy as np +PRESERVE = 0 +TRANSFORM = 1 + +pj = os.path.join + +class LinkedListNode(): + """ + Linked List Node + """ + def __init__(self, string, preserve=True) -> None: + self.string = string + self.preserve = preserve + self.next = None + self.range = None + # self.begin_line = 0 + # self.begin_char = 0 + +def convert_to_linklist(text, mask): + root = LinkedListNode("", preserve=True) + current_node = root + for c, m, i in zip(text, mask, range(len(text))): + if (m==PRESERVE and current_node.preserve) \ + or (m==TRANSFORM and not current_node.preserve): + # add + current_node.string += c + else: + current_node.next = LinkedListNode(c, preserve=(m==PRESERVE)) + current_node = current_node.next + return root + +def post_process(root): + # 修复括号 + node = root + while True: + string = node.string + if node.preserve: + node = node.next + if node is None: break + continue + def break_check(string): + str_stack = [""] # (lv, index) + for i, c in enumerate(string): + if c == '{': + str_stack.append('{') + elif c == '}': + if len(str_stack) == 1: + print('stack fix') + return i + str_stack.pop(-1) + else: + str_stack[-1] += c + return -1 + bp = break_check(string) + + if bp == -1: + pass + elif bp == 0: + node.string = string[:1] + q = LinkedListNode(string[1:], False) + q.next = node.next + node.next = q + else: + node.string = string[:bp] + q = LinkedListNode(string[bp:], False) + q.next = node.next + node.next = q + + node = node.next + if node is None: break + + # 屏蔽空行和太短的句子 + node = root + while True: + if len(node.string.strip('\n').strip(''))==0: node.preserve = True + if len(node.string.strip('\n').strip(''))<42: node.preserve = True + node = node.next + if node is None: break + node = root + while True: + if node.next and node.preserve and node.next.preserve: + node.string += node.next.string + node.next = node.next.next + node = node.next + if node is None: break + + # 将前后断行符脱离 + node = root + prev_node = None + while True: + if not node.preserve: + lstriped_ = node.string.lstrip().lstrip('\n') + if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): + prev_node.string += node.string[:-len(lstriped_)] + node.string = lstriped_ + rstriped_ = node.string.rstrip().rstrip('\n') + if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): + node.next.string = node.string[len(rstriped_):] + node.next.string + node.string = rstriped_ + # ===== + prev_node = node + node = node.next + if node is None: break + + # 标注节点的行数范围 + node = root + n_line = 0 + expansion = 2 + while True: + n_l = node.string.count('\n') + node.range = [n_line-expansion, n_line+n_l+expansion] # 失败时,扭转的范围 + n_line = n_line+n_l + node = node.next + if node is None: break + return root + + +""" +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1) +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +""" + + +def set_forbidden_text(text, mask, pattern, flags=0): + """ + Add a preserve text area in this paper + e.g. 
with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}" + you can mask out (mask = PRESERVE so that text become untouchable for GPT) + everything between "\begin{equation}" and "\end{equation}" + """ + if isinstance(pattern, list): pattern = '|'.join(pattern) + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + mask[res.span()[0]:res.span()[1]] = PRESERVE + return text, mask + +def reverse_forbidden_text(text, mask, pattern, flags=0, forbid_wrapper=True): + """ + Move area out of preserve area (make text editable for GPT) + count the number of the braces so as to catch compelete text area. + e.g. + \begin{abstract} blablablablablabla. \end{abstract} + """ + if isinstance(pattern, list): pattern = '|'.join(pattern) + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + if not forbid_wrapper: + mask[res.span()[0]:res.span()[1]] = TRANSFORM + else: + mask[res.regs[0][0]: res.regs[1][0]] = PRESERVE # '\\begin{abstract}' + mask[res.regs[1][0]: res.regs[1][1]] = TRANSFORM # abstract + mask[res.regs[1][1]: res.regs[0][1]] = PRESERVE # abstract + return text, mask + +def set_forbidden_text_careful_brace(text, mask, pattern, flags=0): + """ + Add a preserve text area in this paper (text become untouchable for GPT). + count the number of the braces so as to catch compelete text area. + e.g. + \caption{blablablablabla\texbf{blablabla}blablabla.} + """ + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + brace_level = -1 + p = begin = end = res.regs[0][0] + for _ in range(1024*16): + if text[p] == '}' and brace_level == 0: break + elif text[p] == '}': brace_level -= 1 + elif text[p] == '{': brace_level += 1 + p += 1 + end = p+1 + mask[begin:end] = PRESERVE + return text, mask + +def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True): + """ + Move area out of preserve area (make text editable for GPT) + count the number of the braces so as to catch compelete text area. + e.g. + \caption{blablablablabla\texbf{blablabla}blablabla.} + """ + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + brace_level = 0 + p = begin = end = res.regs[1][0] + for _ in range(1024*16): + if text[p] == '}' and brace_level == 0: break + elif text[p] == '}': brace_level -= 1 + elif text[p] == '{': brace_level += 1 + p += 1 + end = p + mask[begin:end] = TRANSFORM + if forbid_wrapper: + mask[res.regs[0][0]:begin] = PRESERVE + mask[end:res.regs[0][1]] = PRESERVE + return text, mask + +def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): + """ + Find all \begin{} ... \end{} text block that with less than limit_n_lines lines. 
+ Add it to preserve area + """ + pattern_compile = re.compile(pattern, flags) + def search_with_line_limit(text, mask): + for res in pattern_compile.finditer(text): + cmd = res.group(1) # begin{what} + this = res.group(2) # content between begin and end + this_mask = mask[res.regs[2][0]:res.regs[2][1]] + white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', + 'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate'] + if (cmd in white_list) or this.count('\n') >= limit_n_lines: # use a magical number 42 + this, this_mask = search_with_line_limit(this, this_mask) + mask[res.regs[2][0]:res.regs[2][1]] = this_mask + else: + mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE + return text, mask + return search_with_line_limit(text, mask) + + + +""" +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +Latex Merge File +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +""" + +def find_main_tex_file(file_manifest, mode): + """ + 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 + P.S. 但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码) + """ + canidates = [] + for texf in file_manifest: + if os.path.basename(texf).startswith('merge'): + continue + with open(texf, 'r', encoding='utf8', errors='ignore') as f: + file_content = f.read() + if r'\documentclass' in file_content: + canidates.append(texf) + else: + continue + + if len(canidates) == 0: + raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') + elif len(canidates) == 1: + return canidates[0] + else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回 + canidates_score = [] + # 给出一些判定模板文档的词作为扣分项 + unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers'] + expected_words = ['\input', '\ref', '\cite'] + for texf in canidates: + canidates_score.append(0) + with open(texf, 'r', encoding='utf8', errors='ignore') as f: + file_content = f.read() + for uw in unexpected_words: + if uw in file_content: + canidates_score[-1] -= 1 + for uw in expected_words: + if uw in file_content: + canidates_score[-1] += 1 + select = np.argmax(canidates_score) # 取评分最高者返回 + return canidates[select] + +def rm_comments(main_file): + new_file_remove_comment_lines = [] + for l in main_file.splitlines(): + # 删除整行的空注释 + if l.lstrip().startswith("%"): + pass + else: + new_file_remove_comment_lines.append(l) + main_file = '\n'.join(new_file_remove_comment_lines) + # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令 + main_file = re.sub(r'(? 0 and node_string.count('\_') > final_tex.count('\_'): + # walk and replace any _ without \ + final_tex = re.sub(r"(?
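
A minimal usage sketch of the segmentation machinery this patch moves into `crazy_functions/latex_fns/latex_toolbox.py`: a byte-level mask marks every character of the source as PRESERVE or TRANSFORM, and the mask is then folded into a linked list of alternating segments. The demo text and the printed traversal below are illustrative assumptions, not code from the patch.

```python
import re
import numpy as np
from crazy_functions.latex_fns.latex_toolbox import (
    PRESERVE, TRANSFORM, set_forbidden_text, reverse_forbidden_text, convert_to_linklist)

text = r"\begin{abstract}A short demo abstract.\end{abstract} \begin{equation}E=mc^2\end{equation}"
mask = np.zeros(len(text), dtype=np.uint8) + TRANSFORM   # everything editable by default
# Lock equations away from GPT ...
text, mask = set_forbidden_text(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", flags=re.DOTALL)
# ... but re-open the abstract body while keeping its \begin/\end wrappers preserved
text, mask = reverse_forbidden_text(text, mask, r"\\begin\{abstract\}(.*?)\\end\{abstract\}", flags=re.DOTALL)

node = convert_to_linklist(text, mask)
while node is not None:
    print('PRESERVE ' if node.preserve else 'TRANSFORM', repr(node.string[:40]))
    node = node.next
```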
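
The substantive behavior change in this patch: `remove_buggy_lines` no longer deletes the offending source lines. It reloads the `(lps, file_result, mode, msg)` checkpoint dumped to `merge_result.pkl`, then calls `merge_result` again with the accumulated `buggy_lines`, so any translated segment whose line range sits near a compiler-reported error is reverted to the original untranslated text. A sketch of that predicate; `buggy_line_surgery_n_lines` is a module-level constant defined outside the hunks shown here, so the value below is an assumption.

```python
buggy_line_surgery_n_lines = 10  # assumed value; the real constant is defined outside these hunks

def should_revert(begin_line: int, end_line: int, buggy_lines: list) -> bool:
    # Revert a translated segment spanning [begin_line, end_line] if any
    # pdflatex-reported buggy line lands within the surgery margin of it.
    n = buggy_line_surgery_n_lines
    return any(begin_line - n <= b <= end_line + n for b in buggy_lines)
```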
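
One caveat on the `merge_pdfs` hunk: it swaps `PdfFileReader` for `PdfReader` (the PyPDF2 2.x/3.x name), but the unchanged context still calls `PdfFileWriter` and `addPage`, which PyPDF2 3.x removed outright in favor of `PdfWriter` and `add_page`. A sketch of the function fully on the modern API, keeping the same side-by-side comparison layout; the function name and layout details are illustrative, not the patch's code.

```python
from PyPDF2 import PdfReader, PdfWriter, PageObject, Transformation

def merge_pdfs_modern(pdf1_path, pdf2_path, output_path):
    # Read both PDFs with the PyPDF2 3.x reader API
    reader1, reader2 = PdfReader(pdf1_path), PdfReader(pdf2_path)
    writer = PdfWriter()
    num_pages = max(len(reader1.pages), len(reader2.pages))
    for i in range(num_pages):
        # Fall back to a blank page when one document is shorter
        page1 = reader1.pages[i] if i < len(reader1.pages) else PageObject.create_blank_page(reader1)
        page2 = reader2.pages[i] if i < len(reader2.pages) else PageObject.create_blank_page(reader1)
        w1, h1 = float(page1.mediabox.width), float(page1.mediabox.height)
        w2, h2 = float(page2.mediabox.width), float(page2.mediabox.height)
        # Double-width page: original on the left, translation on the right
        new_page = PageObject.create_blank_page(width=w1 + w2, height=max(h1, h2))
        new_page.merge_transformed_page(page1, Transformation().translate(0, 0))
        new_page.merge_transformed_page(page2, Transformation().translate(w1, 0))
        writer.add_page(new_page)
    with open(output_path, 'wb') as f:
        writer.write(f)
```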