From 8ef734410160f2b8090a2ec10b15069ee60da9b7 Mon Sep 17 00:00:00 2001 From: qingxu fu <505030475@qq.com> Date: Tue, 6 Jun 2023 18:57:52 +0800 Subject: [PATCH] fix subprocess bug in Windows --- crazy_functions/latex_utils.py | 288 +++++++++++++++++---------------- 1 file changed, 145 insertions(+), 143 deletions(-) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 15dfebc..d3d7b9c 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -192,6 +192,149 @@ def fix_content(final_tex, node_string): final_tex = node_string # 出问题了,还原原文 return final_tex +def split_subprocess(txt, project_folder, return_dict): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. + """ + text = txt + mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM + + # 吸收title与作者以上的部分 + text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL) + # 删除iffalse注释 + text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) + # 吸收在25行以内的begin-end组合 + text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) + # 吸收匿名公式 + text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) + # 吸收其他杂项 + text, mask = split_worker(text, mask, r"\\section\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\item ") + text, mask = split_worker(text, mask, r"\\label\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") + # text, mask = split_worker_reverse_caption(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) + root = convert_to_linklist(text, mask) + + # 修复括号 + node = root + while True: + string = node.string + if node.preserve: + node = node.next + if node is None: break + continue + def break_check(string): + str_stack = [""] # (lv, index) + for i, c in enumerate(string): + if c == '{': + str_stack.append('{') + elif c == '}': + if len(str_stack) == 1: + print('stack fix') + return i + str_stack.pop(-1) + else: + str_stack[-1] += c + return -1 + bp = break_check(string) + + if bp == -1: + pass + elif bp == 0: + node.string = string[:1] + q = LinkedListNode(string[1:], False) + q.next = node.next + node.next = q + else: + node.string = string[:bp] + q = LinkedListNode(string[bp:], False) + q.next = node.next + node.next = q + + node = node.next + if node is None: break + + # 屏蔽空行和太短的句子 + node = root + while True: + if len(node.string.strip('\n').strip(''))==0: node.preserve = True + if len(node.string.strip('\n').strip(''))<42: node.preserve = True + node = node.next + if node is None: break + node = root + while True: + if node.next and node.preserve and node.next.preserve: + node.string += node.next.string + node.next = node.next.next + node = node.next + if node is None: break + + # 将前后断行符脱离 + node = root + prev_node = None + while True: + if not node.preserve: + lstriped_ = node.string.lstrip().lstrip('\n') + if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): + prev_node.string += node.string[:-len(lstriped_)] + node.string = lstriped_ + rstriped_ = node.string.rstrip().rstrip('\n') + if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): + node.next.string = node.string[len(rstriped_):] + node.next.string + node.string = rstriped_ + # ===== + prev_node = node + node = node.next + if node is None: break + + with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: + segment_parts_for_gpt = [] + nodes = [] + node = root + while True: + nodes.append(node) + show_html = node.string.replace('\n','
') + if not node.preserve: + segment_parts_for_gpt.append(node.string) + f.write(f'

#{show_html}#

') + else: + f.write(f'

{show_html}

') + node = node.next + if node is None: break + + for n in nodes: n.next = None # break + return_dict['nodes'] = nodes + return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt + return return_dict + + class LatexPaperSplit(): """ @@ -237,156 +380,15 @@ class LatexPaperSplit(): manager = multiprocessing.Manager() return_dict = manager.dict() p = multiprocessing.Process( - target=lambda lps, txt, project_folder, return_dict: - lps.split_subprocess(txt, project_folder, return_dict), - args=(self, txt, project_folder, return_dict)) + target=split_subprocess, + args=(txt, project_folder, return_dict)) p.start() p.join() self.nodes = return_dict['nodes'] self.sp = return_dict['segment_parts_for_gpt'] return self.sp - def split_subprocess(self, txt, project_folder, return_dict): - """ - break down latex file to a linked list, - each node use a preserve flag to indicate whether it should - be proccessed by GPT. - """ - text = txt - mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM - # 吸收title与作者以上的部分 - text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL) - # 删除iffalse注释 - text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) - # 吸收在25行以内的begin-end组合 - text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) - # 吸收匿名公式 - text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) - # 吸收其他杂项 - text, mask = split_worker(text, mask, r"\\section\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\item ") - text, mask = split_worker(text, mask, r"\\label\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") - # text, mask = split_worker_reverse_caption(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) - root = convert_to_linklist(text, mask) - - # 修复括号 - node = root - while True: - string = node.string - if node.preserve: - node = node.next - if node is None: break - continue - def break_check(string): - str_stack = [""] # (lv, index) - for i, c in enumerate(string): - if c == '{': - str_stack.append('{') - elif c == '}': - if len(str_stack) == 1: - print('stack fix') - return i - str_stack.pop(-1) - else: - str_stack[-1] += c - return -1 - bp = break_check(string) - - if bp == -1: - pass - elif bp == 0: - node.string = string[:1] - q = LinkedListNode(string[1:], False) - q.next = node.next - node.next = q - else: - node.string = string[:bp] - q = LinkedListNode(string[bp:], False) - q.next = node.next - node.next = q - - node = node.next - if node is None: break - - # 屏蔽空行和太短的句子 - node = root - while True: - if len(node.string.strip('\n').strip(''))==0: node.preserve = True - if len(node.string.strip('\n').strip(''))<42: node.preserve = True - node = node.next - if node is None: break - node = root - while True: - if node.next and node.preserve and node.next.preserve: - node.string += node.next.string - node.next = node.next.next - node = node.next - if node is None: break - - # 将前后断行符脱离 - node = root - prev_node = None - while True: - if not node.preserve: - lstriped_ = node.string.lstrip().lstrip('\n') - if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): - prev_node.string += node.string[:-len(lstriped_)] - node.string = lstriped_ - rstriped_ = node.string.rstrip().rstrip('\n') - if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): - node.next.string = node.string[len(rstriped_):] + node.next.string - node.string = rstriped_ - # ===== - prev_node = node - node = node.next - if node is None: break - - with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: - segment_parts_for_gpt = [] - nodes = [] - node = root - while True: - nodes.append(node) - show_html = node.string.replace('\n','
') - if not node.preserve: - segment_parts_for_gpt.append(node.string) - f.write(f'

#{show_html}#

') - else: - f.write(f'

{show_html}

') - node = node.next - if node is None: break - - for n in nodes: n.next = None # break - return_dict['nodes'] = nodes - return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt - return return_dict class LatexPaperFileGroup(): """