merge regex expressions
This commit is contained in:
		
							parent
							
								
									e90048a671
								
							
						
					
					
						commit
						f654c1af31
					
				@ -190,9 +190,11 @@ def test_Latex():
 | 
				
			|||||||
    # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder"  #  ACE
 | 
					    # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder"  #  ACE
 | 
				
			||||||
    # txt = r"https://arxiv.org/abs/2002.09253"
 | 
					    # txt = r"https://arxiv.org/abs/2002.09253"
 | 
				
			||||||
    # txt = r"https://arxiv.org/abs/2306.07831"
 | 
					    # txt = r"https://arxiv.org/abs/2306.07831"
 | 
				
			||||||
    # txt = r"https://arxiv.org/abs/2212.10156"
 | 
					    txt = r"https://arxiv.org/abs/2212.10156"
 | 
				
			||||||
    # txt = r"https://arxiv.org/abs/2211.11559"
 | 
					    # txt = r"https://arxiv.org/abs/2211.11559"
 | 
				
			||||||
    txt = r"https://arxiv.org/abs/2303.08774"
 | 
					    # txt = r"https://arxiv.org/abs/2303.08774"
 | 
				
			||||||
 | 
					    # txt = r"https://arxiv.org/abs/2303.12712"
 | 
				
			||||||
 | 
					    # txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder"
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
 | 
					    for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
 | 
				
			||||||
 | 
				
			|||||||
@ -21,6 +21,7 @@ def set_forbidden_text(text, mask, pattern, flags=0):
 | 
				
			|||||||
    you can mask out (mask = PRESERVE so that text become untouchable for GPT) 
 | 
					    you can mask out (mask = PRESERVE so that text become untouchable for GPT) 
 | 
				
			||||||
    everything between "\begin{equation}" and "\end{equation}"
 | 
					    everything between "\begin{equation}" and "\end{equation}"
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    if isinstance(pattern, list): pattern = '|'.join(pattern)
 | 
				
			||||||
    pattern_compile = re.compile(pattern, flags)
 | 
					    pattern_compile = re.compile(pattern, flags)
 | 
				
			||||||
    for res in pattern_compile.finditer(text):
 | 
					    for res in pattern_compile.finditer(text):
 | 
				
			||||||
        mask[res.span()[0]:res.span()[1]] = PRESERVE
 | 
					        mask[res.span()[0]:res.span()[1]] = PRESERVE
 | 
				
			||||||
@ -46,7 +47,7 @@ def set_forbidden_text_careful_brace(text, mask, pattern, flags=0):
 | 
				
			|||||||
        mask[begin:end] = PRESERVE
 | 
					        mask[begin:end] = PRESERVE
 | 
				
			||||||
    return text, mask
 | 
					    return text, mask
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0):
 | 
					def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Move area out of preserve area (make text editable for GPT)
 | 
					    Move area out of preserve area (make text editable for GPT)
 | 
				
			||||||
    count the number of the braces so as to catch compelete text area. 
 | 
					    count the number of the braces so as to catch compelete text area. 
 | 
				
			||||||
@ -64,6 +65,9 @@ def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0):
 | 
				
			|||||||
            p += 1
 | 
					            p += 1
 | 
				
			||||||
        end = p
 | 
					        end = p
 | 
				
			||||||
        mask[begin:end] = TRANSFORM
 | 
					        mask[begin:end] = TRANSFORM
 | 
				
			||||||
 | 
					        if forbid_wrapper:
 | 
				
			||||||
 | 
					            mask[res.regs[0][0]:begin] = PRESERVE
 | 
				
			||||||
 | 
					            mask[end:res.regs[0][1]] = PRESERVE
 | 
				
			||||||
    return text, mask
 | 
					    return text, mask
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
 | 
					def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
 | 
				
			||||||
@ -163,6 +167,7 @@ def rm_comments(main_file):
 | 
				
			|||||||
        else:
 | 
					        else:
 | 
				
			||||||
            new_file_remove_comment_lines.append(l)
 | 
					            new_file_remove_comment_lines.append(l)
 | 
				
			||||||
    main_file = '\n'.join(new_file_remove_comment_lines)
 | 
					    main_file = '\n'.join(new_file_remove_comment_lines)
 | 
				
			||||||
 | 
					    # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file)  # 将 \include 命令转换为 \input 命令
 | 
				
			||||||
    main_file = re.sub(r'(?<!\\)%.*', '', main_file)  # 使用正则表达式查找半行注释, 并替换为空字符串
 | 
					    main_file = re.sub(r'(?<!\\)%.*', '', main_file)  # 使用正则表达式查找半行注释, 并替换为空字符串
 | 
				
			||||||
    return main_file
 | 
					    return main_file
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -209,9 +214,11 @@ def merge_tex_files(project_foler, main_file, mode):
 | 
				
			|||||||
        main_file = re.sub(r"\\documentclass\[(.*?)\]{(.*?)}", r"\\documentclass[\1,fontset=windows,UTF8]{\2}",main_file)
 | 
					        main_file = re.sub(r"\\documentclass\[(.*?)\]{(.*?)}", r"\\documentclass[\1,fontset=windows,UTF8]{\2}",main_file)
 | 
				
			||||||
        main_file = re.sub(r"\\documentclass{(.*?)}", r"\\documentclass[fontset=windows,UTF8]{\1}",main_file)
 | 
					        main_file = re.sub(r"\\documentclass{(.*?)}", r"\\documentclass[fontset=windows,UTF8]{\1}",main_file)
 | 
				
			||||||
        # find paper abstract
 | 
					        # find paper abstract
 | 
				
			||||||
        pattern = re.compile(r'\\begin\{abstract\}.*\n')
 | 
					        pattern_opt1 = re.compile(r'\\begin\{abstract\}.*\n')
 | 
				
			||||||
        match = pattern.search(main_file)
 | 
					        pattern_opt2 = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
 | 
				
			||||||
        assert match is not None, "Cannot find paper abstract section!"
 | 
					        match_opt1 = pattern_opt1.search(main_file)
 | 
				
			||||||
 | 
					        match_opt2 = pattern_opt2.search(main_file)
 | 
				
			||||||
 | 
					        assert (match_opt1 is not None) or (match_opt2 is not None), "Cannot find paper abstract section!"
 | 
				
			||||||
    return main_file
 | 
					    return main_file
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -293,48 +300,32 @@ def split_subprocess(txt, project_folder, return_dict, opts):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    # 吸收title与作者以上的部分
 | 
					    # 吸收title与作者以上的部分
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"(.*?)\\maketitle", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, r"(.*?)\\maketitle", re.DOTALL)
 | 
				
			||||||
    # 删除iffalse注释
 | 
					    # 吸收iffalse注释
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
 | 
				
			||||||
    # 吸收在25行以内的begin-end组合
 | 
					    # 吸收在25行以内的begin-end组合
 | 
				
			||||||
    text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42)
 | 
					    text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42)
 | 
				
			||||||
    # 吸收匿名公式
 | 
					    # 吸收匿名公式
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\$\$(.*?)\$\$", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [ r"\$\$(.*?)\$\$",  r"\\\[.*?\\\]" ], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\\[.*?\\\]", re.DOTALL)
 | 
					 | 
				
			||||||
    # 吸收其他杂项
 | 
					    # 吸收其他杂项
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\section\{(.*?)\}")
 | 
					    text, mask = set_forbidden_text(text, mask, [ r"\\section\{(.*?)\}", r"\\section\*\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ])
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\section\*\{(.*?)\}")
 | 
					    text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ])
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\subsection\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\subsubsection\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\bibliography\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\bibliographystyle\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}"], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\begin\{figure\}(.*?)\\end\{figure\}", r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}"], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\begin\{multline\}(.*?)\\end\{multline\}", r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}"], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\begin\{table\}(.*?)\\end\{table\}", r"\\begin\{table\*\}(.*?)\\end\{table\*\}"], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\begin\{minipage\}(.*?)\\end\{minipage\}", r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}"], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\begin\{align\*\}(.*?)\\end\{align\*\}", r"\\begin\{align\}(.*?)\\end\{align\}"], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\begin\{equation\}(.*?)\\end\{equation\}", r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}"], re.DOTALL)
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"])
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}"])
 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL)
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL)
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\includepdf\[(.*?)\]\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\item ")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\label\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\begin\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\vspace\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\hspace\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text(text, mask, r"\\end\{(.*?)\}")
 | 
					 | 
				
			||||||
    text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
 | 
					    text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
 | 
				
			||||||
    # reverse 操作必须放在最后
 | 
					    # reverse 操作必须放在最后
 | 
				
			||||||
    text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL)
 | 
					    text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
 | 
				
			||||||
 | 
					    text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\abstract\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
 | 
				
			||||||
    root = convert_to_linklist(text, mask)
 | 
					    root = convert_to_linklist(text, mask)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # 修复括号
 | 
					    # 修复括号
 | 
				
			||||||
@ -408,7 +399,7 @@ def split_subprocess(txt, project_folder, return_dict, opts):
 | 
				
			|||||||
        prev_node = node
 | 
					        prev_node = node
 | 
				
			||||||
        node = node.next
 | 
					        node = node.next
 | 
				
			||||||
        if node is None: break
 | 
					        if node is None: break
 | 
				
			||||||
 | 
					    # 输出html调试文件,用红色标注处保留区(PRESERVE),用黑色标注转换区(TRANSFORM)
 | 
				
			||||||
    with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
 | 
					    with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
 | 
				
			||||||
        segment_parts_for_gpt = []
 | 
					        segment_parts_for_gpt = []
 | 
				
			||||||
        nodes = []
 | 
					        nodes = []
 | 
				
			||||||
@ -461,9 +452,13 @@ class LatexPaperSplit():
 | 
				
			|||||||
            pattern = re.compile(r'\\begin\{abstract\}.*\n')
 | 
					            pattern = re.compile(r'\\begin\{abstract\}.*\n')
 | 
				
			||||||
            match = pattern.search(result_string)
 | 
					            match = pattern.search(result_string)
 | 
				
			||||||
            if not match:
 | 
					            if not match:
 | 
				
			||||||
                pattern = re.compile(r'\\abstract\{')
 | 
					                # match \abstract{xxxx}
 | 
				
			||||||
                match = pattern.search(result_string)
 | 
					                pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
 | 
				
			||||||
            position = match.end()
 | 
					                match = pattern_compile.search(result_string)
 | 
				
			||||||
 | 
					                position = match.regs[1][0]
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                # match \begin{abstract}xxxx\end{abstract}
 | 
				
			||||||
 | 
					                position = match.end()
 | 
				
			||||||
            result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
 | 
					            result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
 | 
				
			||||||
        return result_string
 | 
					        return result_string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -482,6 +477,7 @@ class LatexPaperSplit():
 | 
				
			|||||||
            args=(txt, project_folder, return_dict, opts))
 | 
					            args=(txt, project_folder, return_dict, opts))
 | 
				
			||||||
        p.start()
 | 
					        p.start()
 | 
				
			||||||
        p.join()
 | 
					        p.join()
 | 
				
			||||||
 | 
					        p.close()
 | 
				
			||||||
        self.nodes = return_dict['nodes']
 | 
					        self.nodes = return_dict['nodes']
 | 
				
			||||||
        self.sp = return_dict['segment_parts_for_gpt']
 | 
					        self.sp = return_dict['segment_parts_for_gpt']
 | 
				
			||||||
        return self.sp
 | 
					        return self.sp
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user