merge regex expressions

This commit is contained in:
qingxu fu 2023-06-27 18:59:56 +08:00
parent e90048a671
commit f654c1af31
2 changed files with 39 additions and 41 deletions

View File

@ -190,9 +190,11 @@ def test_Latex():
# txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE
# txt = r"https://arxiv.org/abs/2002.09253" # txt = r"https://arxiv.org/abs/2002.09253"
# txt = r"https://arxiv.org/abs/2306.07831" # txt = r"https://arxiv.org/abs/2306.07831"
# txt = r"https://arxiv.org/abs/2212.10156" txt = r"https://arxiv.org/abs/2212.10156"
# txt = r"https://arxiv.org/abs/2211.11559" # txt = r"https://arxiv.org/abs/2211.11559"
txt = r"https://arxiv.org/abs/2303.08774" # txt = r"https://arxiv.org/abs/2303.08774"
# txt = r"https://arxiv.org/abs/2303.12712"
# txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder"
for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):

View File

@ -21,6 +21,7 @@ def set_forbidden_text(text, mask, pattern, flags=0):
you can mask out (mask = PRESERVE so that text becomes untouchable for GPT) you can mask out (mask = PRESERVE so that text becomes untouchable for GPT)
everything between "\begin{equation}" and "\end{equation}" everything between "\begin{equation}" and "\end{equation}"
""" """
if isinstance(pattern, list): pattern = '|'.join(pattern)
pattern_compile = re.compile(pattern, flags) pattern_compile = re.compile(pattern, flags)
for res in pattern_compile.finditer(text): for res in pattern_compile.finditer(text):
mask[res.span()[0]:res.span()[1]] = PRESERVE mask[res.span()[0]:res.span()[1]] = PRESERVE
@ -46,7 +47,7 @@ def set_forbidden_text_careful_brace(text, mask, pattern, flags=0):
mask[begin:end] = PRESERVE mask[begin:end] = PRESERVE
return text, mask return text, mask
def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0): def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True):
""" """
Move area out of preserve area (make text editable for GPT) Move area out of preserve area (make text editable for GPT)
count the number of the braces so as to catch complete text area. count the number of the braces so as to catch complete text area.
@ -64,6 +65,9 @@ def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0):
p += 1 p += 1
end = p end = p
mask[begin:end] = TRANSFORM mask[begin:end] = TRANSFORM
if forbid_wrapper:
mask[res.regs[0][0]:begin] = PRESERVE
mask[end:res.regs[0][1]] = PRESERVE
return text, mask return text, mask
def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
@ -163,6 +167,7 @@ def rm_comments(main_file):
else: else:
new_file_remove_comment_lines.append(l) new_file_remove_comment_lines.append(l)
main_file = '\n'.join(new_file_remove_comment_lines) main_file = '\n'.join(new_file_remove_comment_lines)
# main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令
main_file = re.sub(r'(?<!\\)%.*', '', main_file) # 使用正则表达式查找半行注释, 并替换为空字符串 main_file = re.sub(r'(?<!\\)%.*', '', main_file) # 使用正则表达式查找半行注释, 并替换为空字符串
return main_file return main_file
@ -209,9 +214,11 @@ def merge_tex_files(project_foler, main_file, mode):
main_file = re.sub(r"\\documentclass\[(.*?)\]{(.*?)}", r"\\documentclass[\1,fontset=windows,UTF8]{\2}",main_file) main_file = re.sub(r"\\documentclass\[(.*?)\]{(.*?)}", r"\\documentclass[\1,fontset=windows,UTF8]{\2}",main_file)
main_file = re.sub(r"\\documentclass{(.*?)}", r"\\documentclass[fontset=windows,UTF8]{\1}",main_file) main_file = re.sub(r"\\documentclass{(.*?)}", r"\\documentclass[fontset=windows,UTF8]{\1}",main_file)
# find paper abstract # find paper abstract
pattern = re.compile(r'\\begin\{abstract\}.*\n') pattern_opt1 = re.compile(r'\\begin\{abstract\}.*\n')
match = pattern.search(main_file) pattern_opt2 = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
assert match is not None, "Cannot find paper abstract section!" match_opt1 = pattern_opt1.search(main_file)
match_opt2 = pattern_opt2.search(main_file)
assert (match_opt1 is not None) or (match_opt2 is not None), "Cannot find paper abstract section!"
return main_file return main_file
@ -293,48 +300,32 @@ def split_subprocess(txt, project_folder, return_dict, opts):
# 吸收title与作者以上的部分 # 吸收title与作者以上的部分
text, mask = set_forbidden_text(text, mask, r"(.*?)\\maketitle", re.DOTALL) text, mask = set_forbidden_text(text, mask, r"(.*?)\\maketitle", re.DOTALL)
# 删除iffalse注释 # 吸收iffalse注释
text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
# 吸收在42行以内的begin-end组合 # 吸收在42行以内的begin-end组合
text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42) text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42)
# 吸收匿名公式 # 吸收匿名公式
text, mask = set_forbidden_text(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) text, mask = set_forbidden_text(text, mask, [ r"\$\$(.*?)\$\$", r"\\\[.*?\\\]" ], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\\[.*?\\\]", re.DOTALL)
# 吸收其他杂项 # 吸收其他杂项
text, mask = set_forbidden_text(text, mask, r"\\section\{(.*?)\}") text, mask = set_forbidden_text(text, mask, [ r"\\section\{(.*?)\}", r"\\section\*\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ])
text, mask = set_forbidden_text(text, mask, r"\\section\*\{(.*?)\}") text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ])
text, mask = set_forbidden_text(text, mask, r"\\subsection\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\subsubsection\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\bibliography\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\bibliographystyle\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\begin\{figure\}(.*?)\\end\{figure\}", r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\begin\{multline\}(.*?)\\end\{multline\}", r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\begin\{table\}(.*?)\\end\{table\}", r"\\begin\{table\*\}(.*?)\\end\{table\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\begin\{minipage\}(.*?)\\end\{minipage\}", r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\begin\{align\*\}(.*?)\\end\{align\*\}", r"\\begin\{align\}(.*?)\\end\{align\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\begin\{equation\}(.*?)\\end\{equation\}", r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"])
text, mask = set_forbidden_text(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}"])
text, mask = set_forbidden_text(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\includepdf\[(.*?)\]\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\item ")
text, mask = set_forbidden_text(text, mask, r"\\label\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\begin\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\vspace\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\hspace\{(.*?)\}")
text, mask = set_forbidden_text(text, mask, r"\\end\{(.*?)\}")
text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL) text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
# reverse 操作必须放在最后 # reverse 操作必须放在最后
text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\abstract\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
root = convert_to_linklist(text, mask) root = convert_to_linklist(text, mask)
# 修复括号 # 修复括号
@ -408,7 +399,7 @@ def split_subprocess(txt, project_folder, return_dict, opts):
prev_node = node prev_node = node
node = node.next node = node.next
if node is None: break if node is None: break
# 输出html调试文件，用红色标注处保留区（PRESERVE），用黑色标注转换区（TRANSFORM）
with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
segment_parts_for_gpt = [] segment_parts_for_gpt = []
nodes = [] nodes = []
@ -461,9 +452,13 @@ class LatexPaperSplit():
pattern = re.compile(r'\\begin\{abstract\}.*\n') pattern = re.compile(r'\\begin\{abstract\}.*\n')
match = pattern.search(result_string) match = pattern.search(result_string)
if not match: if not match:
pattern = re.compile(r'\\abstract\{') # match \abstract{xxxx}
match = pattern.search(result_string) pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
position = match.end() match = pattern_compile.search(result_string)
position = match.regs[1][0]
else:
# match \begin{abstract}xxxx\end{abstract}
position = match.end()
result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
return result_string return result_string
@ -482,6 +477,7 @@ class LatexPaperSplit():
args=(txt, project_folder, return_dict, opts)) args=(txt, project_folder, return_dict, opts))
p.start() p.start()
p.join() p.join()
p.close()
self.nodes = return_dict['nodes'] self.nodes = return_dict['nodes']
self.sp = return_dict['segment_parts_for_gpt'] self.sp = return_dict['segment_parts_for_gpt']
return self.sp return self.sp