diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py index 3ef555d..f2d3969 100644 --- a/crazy_functions/crazy_functions_test.py +++ b/crazy_functions/crazy_functions_test.py @@ -190,9 +190,11 @@ def test_Latex(): # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE # txt = r"https://arxiv.org/abs/2002.09253" # txt = r"https://arxiv.org/abs/2306.07831" - # txt = r"https://arxiv.org/abs/2212.10156" + txt = r"https://arxiv.org/abs/2212.10156" # txt = r"https://arxiv.org/abs/2211.11559" - txt = r"https://arxiv.org/abs/2303.08774" + # txt = r"https://arxiv.org/abs/2303.08774" + # txt = r"https://arxiv.org/abs/2303.12712" + # txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder" for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 3e4f37c..49f547c 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -8,24 +8,31 @@ pj = os.path.join """ ======================================================================== Part One -Latex segmentation to a linklist +Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1) ======================================================================== """ PRESERVE = 0 TRANSFORM = 1 -def split_worker(text, mask, pattern, flags=0): +def set_forbidden_text(text, mask, pattern, flags=0): """ Add a preserve text area in this paper + e.g. 
with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}" + you can mask out (mask = PRESERVE so that text becomes untouchable for GPT) + everything between "\begin{algorithm}" and "\end{algorithm}" """ + if isinstance(pattern, list): pattern = '|'.join(pattern) pattern_compile = re.compile(pattern, flags) for res in pattern_compile.finditer(text): mask[res.span()[0]:res.span()[1]] = PRESERVE return text, mask -def split_worker_careful_brace(text, mask, pattern, flags=0): +def set_forbidden_text_careful_brace(text, mask, pattern, flags=0): """ - Move area into preserve area + Add a preserve text area in this paper (text becomes untouchable for GPT). + count the number of the braces so as to catch complete text area. + e.g. + \caption{blablablablabla\textbf{blablabla}blablabla.} """ pattern_compile = re.compile(pattern, flags) for res in pattern_compile.finditer(text): @@ -40,9 +47,12 @@ def split_worker_careful_brace(text, mask, pattern, flags=0): mask[begin:end] = PRESERVE return text, mask -def split_worker_reverse_careful_brace(text, mask, pattern, flags=0): +def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True): """ - Move area out of preserve area + Move area out of preserve area (make text editable for GPT) + count the number of the braces so as to catch complete text area. + e.g. + \caption{blablablablabla\textbf{blablabla}blablabla.} """ pattern_compile = re.compile(pattern, flags) for res in pattern_compile.finditer(text): @@ -55,9 +65,12 @@ def split_worker_reverse_careful_brace(text, mask, pattern, flags=0): p += 1 end = p mask[begin:end] = TRANSFORM + if forbid_wrapper: + mask[res.regs[0][0]:begin] = PRESERVE + mask[end:res.regs[0][1]] = PRESERVE return text, mask -def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): +def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): """ Find all \begin{} ... \end{} text block that with less than limit_n_lines lines. 
Add it to preserve area @@ -154,6 +167,7 @@ def rm_comments(main_file): else: new_file_remove_comment_lines.append(l) main_file = '\n'.join(new_file_remove_comment_lines) + # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令 main_file = re.sub(r'(?