From 9a5a509dd9c85949a89a7ef763572dca92afeb46 Mon Sep 17 00:00:00 2001 From: OverKit Date: Sat, 17 Jun 2023 19:27:21 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=85=B3=E4=BA=8Eabstrac?= =?UTF-8?q?t=E7=9A=84=E6=90=9C=E7=B4=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/latex_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 78eec29..3734f00 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -424,7 +424,9 @@ class LatexPaperSplit(): if mode == 'translate_zh': pattern = re.compile(r'\\begin\{abstract\}.*\n') match = pattern.search(result_string) - assert match is not None, "Cannot find paper abstract section!" + if not match: + pattern = re.compile(r'\\abstract\{') + match = pattern.search(result_string) position = match.end() result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] return result_string From 7fdf0a8e51ee7acfcb2822d07a6c3ed1e8c52846 Mon Sep 17 00:00:00 2001 From: OverKit Date: Sun, 18 Jun 2023 15:51:29 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8C=BA=E5=88=86?= =?UTF-8?q?=E5=86=85=E5=AE=B9=E7=9A=84=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/latex_utils.py | 81 +++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 3734f00..eebce80 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -23,38 +23,67 @@ def split_worker(text, mask, pattern, flags=0): mask[res.span()[0]:res.span()[1]] = PRESERVE return text, mask -def split_worker_careful_brace(text, mask, pattern, flags=0): +def set_transform_area(text, mask, pattern, flags=0): """ - Move area into preserve area + Add a transform text area in this paper """ pattern_compile = re.compile(pattern, flags) for res in pattern_compile.finditer(text): - brace_level = -1 - p = begin = end = res.regs[0][0] - for _ in range(1024*16): - if text[p] == '}' and brace_level == 0: break - elif text[p] == '}': brace_level -= 1 - elif text[p] == '{': brace_level += 1 - p += 1 - end = p+1 - mask[begin:end] = PRESERVE + mask[res.span()[0] : res.span()[1]] = TRANSFORM return text, mask + +def split_worker_careful_brace(text, mask, pattern, flags=0): + """ + Move area into preserve area. + It is better to wrap the curly braces in the capture group, e.g., r"\\captioin(\{.*\})". + """ + pattern_compile = re.compile(pattern, flags) + res = pattern_compile.search(text) + + # 确保捕获组存在 + if res and len(res.regs) > 1: + brace_level = 0 + p = begin = end = res.regs[1][0] + for _ in range(1024 * 16): + if text[p] == "}" and brace_level == 1: + break + elif text[p] == "}": + brace_level -= 1 + elif text[p] == "{": + brace_level += 1 + p += 1 + end = p + mask[begin + 1 : end] = PRESERVE + split_worker_careful_brace(text[end:], mask[end:], pattern, flags=flags) + + return text, mask + + def split_worker_reverse_careful_brace(text, mask, pattern, flags=0): """ - Move area out of preserve area + Move area out of preserve area. + It is better to wrap the curly braces in the capture group, e.g., r"\\captioin(\{.*\})". """ pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): + res = pattern_compile.search(text) + + # 确保捕获组存在 + if res and len(res.regs) > 1: brace_level = 0 p = begin = end = res.regs[1][0] - for _ in range(1024*16): - if text[p] == '}' and brace_level == 0: break - elif text[p] == '}': brace_level -= 1 - elif text[p] == '{': brace_level += 1 + for _ in range(1024 * 16): + if text[p] == "}" and brace_level == 1: + break + elif text[p] == "}": + brace_level -= 1 + elif text[p] == "{": + brace_level += 1 p += 1 end = p - mask[begin:end] = TRANSFORM + mask[begin + 1 : end] = TRANSFORM + split_worker_reverse_careful_brace(text[end:], mask[end:], pattern, flags=flags) + return text, mask def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): @@ -260,13 +289,14 @@ def split_subprocess(txt, project_folder, return_dict, opts): mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM # 吸收title与作者以上的部分 - text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL) + text, mask = split_worker(text, mask, r".*?\\begin\{document\}", re.DOTALL) # 删除iffalse注释 text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) # 吸收在25行以内的begin-end组合 text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) # 吸收匿名公式 text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) + text, mask = split_worker(text, mask, r"\\\[.*?\\\]", re.DOTALL) # 吸收其他杂项 text, mask = split_worker(text, mask, r"\\section\{(.*?)\}") text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}") @@ -274,6 +304,7 @@ def split_subprocess(txt, project_folder, return_dict, opts): text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}") text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}") text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL) text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) @@ -293,12 +324,18 @@ def split_subprocess(txt, project_folder, return_dict, opts): text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) text, mask = split_worker(text, mask, r"\\item ") text, mask = split_worker(text, mask, r"\\label\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}") text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}") + + text, mask = set_transform_area(text, mask, r"\\begin\{abstract\}.*?\\end\{abstract\}", re.DOTALL) + + text, mask = split_worker_careful_brace(text, mask, r"\\hl(\{.*\})", re.DOTALL) + text, mask = split_worker_reverse_careful_brace(text, mask, r"\\caption(\{.*\})", re.DOTALL) + text, mask = split_worker_reverse_careful_brace(text, mask, r"\\abstract(\{.*\})", re.DOTALL) + + text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") - text, mask = split_worker_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL) - text, mask = split_worker_reverse_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) + root = convert_to_linklist(text, mask) # 修复括号 From d5bab093f94523665c5b0a6b7781dd491123faff Mon Sep 17 00:00:00 2001 From: 505030475 <505030475@qq.com> Date: Mon, 19 Jun 2023 15:17:33 +1000 Subject: [PATCH 3/3] rename function names --- crazy_functions/latex_utils.py | 163 ++++++++++++++------------------- 1 file changed, 69 insertions(+), 94 deletions(-) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index eebce80..a984b2f 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -8,85 +8,65 @@ pj = os.path.join """ ======================================================================== Part One -Latex segmentation to a linklist +Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1) ======================================================================== """ PRESERVE = 0 TRANSFORM = 1 -def split_worker(text, mask, pattern, flags=0): +def set_forbidden_text(text, mask, pattern, flags=0): """ Add a preserve text area in this paper + e.g. with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}" + you can mask out (mask = PRESERVE so that text become untouchable for GPT) + everything between "\begin{equation}" and "\end{equation}" """ pattern_compile = re.compile(pattern, flags) for res in pattern_compile.finditer(text): mask[res.span()[0]:res.span()[1]] = PRESERVE return text, mask -def set_transform_area(text, mask, pattern, flags=0): +def set_forbidden_text_careful_brace(text, mask, pattern, flags=0): """ - Add a transform text area in this paper + Add a preserve text area in this paper (text become untouchable for GPT). + count the number of the braces so as to catch compelete text area. + e.g. + \caption{blablablablabla\texbf{blablabla}blablabla.} """ pattern_compile = re.compile(pattern, flags) for res in pattern_compile.finditer(text): - mask[res.span()[0] : res.span()[1]] = TRANSFORM + brace_level = -1 + p = begin = end = res.regs[0][0] + for _ in range(1024*16): + if text[p] == '}' and brace_level == 0: break + elif text[p] == '}': brace_level -= 1 + elif text[p] == '{': brace_level += 1 + p += 1 + end = p+1 + mask[begin:end] = PRESERVE return text, mask - -def split_worker_careful_brace(text, mask, pattern, flags=0): +def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0): """ - Move area into preserve area. - It is better to wrap the curly braces in the capture group, e.g., r"\\captioin(\{.*\})". + Move area out of preserve area (make text editable for GPT) + count the number of the braces so as to catch compelete text area. + e.g. + \caption{blablablablabla\texbf{blablabla}blablabla.} """ pattern_compile = re.compile(pattern, flags) - res = pattern_compile.search(text) - - # 确保捕获组存在 - if res and len(res.regs) > 1: + for res in pattern_compile.finditer(text): brace_level = 0 p = begin = end = res.regs[1][0] - for _ in range(1024 * 16): - if text[p] == "}" and brace_level == 1: - break - elif text[p] == "}": - brace_level -= 1 - elif text[p] == "{": - brace_level += 1 + for _ in range(1024*16): + if text[p] == '}' and brace_level == 0: break + elif text[p] == '}': brace_level -= 1 + elif text[p] == '{': brace_level += 1 p += 1 end = p - mask[begin + 1 : end] = PRESERVE - split_worker_careful_brace(text[end:], mask[end:], pattern, flags=flags) - + mask[begin:end] = TRANSFORM return text, mask - -def split_worker_reverse_careful_brace(text, mask, pattern, flags=0): - """ - Move area out of preserve area. - It is better to wrap the curly braces in the capture group, e.g., r"\\captioin(\{.*\})". - """ - pattern_compile = re.compile(pattern, flags) - res = pattern_compile.search(text) - - # 确保捕获组存在 - if res and len(res.regs) > 1: - brace_level = 0 - p = begin = end = res.regs[1][0] - for _ in range(1024 * 16): - if text[p] == "}" and brace_level == 1: - break - elif text[p] == "}": - brace_level -= 1 - elif text[p] == "{": - brace_level += 1 - p += 1 - end = p - mask[begin + 1 : end] = TRANSFORM - split_worker_reverse_careful_brace(text[end:], mask[end:], pattern, flags=flags) - - return text, mask - -def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): +def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): """ Find all \begin{} ... \end{} text block that with less than limit_n_lines lines. Add it to preserve area @@ -289,53 +269,48 @@ def split_subprocess(txt, project_folder, return_dict, opts): mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM # 吸收title与作者以上的部分 - text, mask = split_worker(text, mask, r".*?\\begin\{document\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"(.*?)\\maketitle", re.DOTALL) # 删除iffalse注释 - text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) # 吸收在25行以内的begin-end组合 - text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) + text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42) # 吸收匿名公式 - text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) - text, mask = split_worker(text, mask, r"\\\[.*?\\\]", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\\[.*?\\\]", re.DOTALL) # 吸收其他杂项 - text, mask = split_worker(text, mask, r"\\section\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\item ") - text, mask = split_worker(text, mask, r"\\label\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}") - - text, mask = set_transform_area(text, mask, r"\\begin\{abstract\}.*?\\end\{abstract\}", re.DOTALL) - - text, mask = split_worker_careful_brace(text, mask, r"\\hl(\{.*\})", re.DOTALL) - text, mask = split_worker_reverse_careful_brace(text, mask, r"\\caption(\{.*\})", re.DOTALL) - text, mask = split_worker_reverse_careful_brace(text, mask, r"\\abstract(\{.*\})", re.DOTALL) - - text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") - + text, mask = set_forbidden_text(text, mask, r"\\section\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\section\*\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\subsection\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\subsubsection\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\bibliography\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\bibliographystyle\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\item ") + text, mask = set_forbidden_text(text, mask, r"\\label\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\begin\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\vspace\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\hspace\{(.*?)\}") + text, mask = set_forbidden_text(text, mask, r"\\end\{(.*?)\}") + text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL) + # reverse 操作必须放在最后 + text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) root = convert_to_linklist(text, mask) # 修复括号