diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py
index 99672e8..b490b5c 100644
--- a/crazy_functions/latex_utils.py
+++ b/crazy_functions/latex_utils.py
@@ -7,26 +7,36 @@ pj = os.path.join
 
 """
 ========================================================================
-第一部分
-Latex 文件切分到一个链表中
+Part One
+LaTeX segmentation into a linked list
 ========================================================================
 """
 PRESERVE = 0
 TRANSFORM = 1
 
 def split_worker(text, mask, pattern, flags=0):
+    """
+    Add a preserved text area to this paper
+    """
     pattern_compile = re.compile(pattern, flags)
     for res in pattern_compile.finditer(text):
         mask[res.span()[0]:res.span()[1]] = PRESERVE
     return text, mask
 
 def split_worker_reverse_caption(text, mask, pattern, flags=0):
+    """
+    Move the caption area out of the preserved area
+    """
     pattern_compile = re.compile(pattern, flags)
     for res in pattern_compile.finditer(text):
         mask[res.regs[1][0]:res.regs[1][1]] = TRANSFORM
     return text, mask
 
-def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=25):
+def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
+    """
+    Find all \begin{} ... \end{} text blocks with fewer than limit_n_lines lines
+    and add them to the preserved area
+    """
     pattern_compile = re.compile(pattern, flags)
     def search_with_line_limit(text, mask):
         for res in pattern_compile.finditer(text):
@@ -35,7 +45,7 @@ def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=25):
             this_mask = mask[res.regs[2][0]:res.regs[2][1]]
             white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof',
                           'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate']
-            if (cmd in white_list) or this.count('\n') >= 42: # use a magical number 42
+            if (cmd in white_list) or this.count('\n') >= limit_n_lines:    # limit_n_lines defaults to the magic number 42
                 this, this_mask = search_with_line_limit(this, this_mask)
                 mask[res.regs[2][0]:res.regs[2][1]] = this_mask
             else:
@@ -45,7 +55,7 @@
 
 class LinkedListNode():
     """
-    链表单元
+    Linked List Node
     """
     def __init__(self, string, preserve=True) -> None:
         self.string = string
@@ -68,7 +78,7 @@ def convert_to_linklist(text, mask):
     return root
 
 """
 ========================================================================
-Latex 文件融合
+LaTeX file merging
 ========================================================================
 """
@@ -90,7 +100,7 @@ def 寻找Latex主文件(file_manifest, mode):
 
 def merge_tex_files_(project_foler, main_file, mode):
     """
-    递归地把多Tex工程整合为一个Tex文档
+    Merge a multi-file TeX project into a single document, recursively
     """
     for s in reversed([q for q in re.finditer(r"\\input\{(.*?)\}", main_file, re.M)]):
         f = s.group(1)
@@ -109,7 +119,7 @@ def merge_tex_files_(project_foler, main_file, mode):
 
 def merge_tex_files(project_foler, main_file, mode):
     """
-    递归地把多Tex工程整合为一个Tex文档(递归外层)
+    Merge a multi-file TeX project into a single document, recursively (outer layer)
     P.S. 顺便把CTEX塞进去以支持中文
     P.S. 顺便把Latex的注释去除
     """
@@ -142,7 +152,7 @@ def merge_tex_files(project_foler, main_file, mode):
 
 """
 ========================================================================
-后处理
+Post-processing
 ========================================================================
 """
 def mod_inbraket(match):
@@ -182,7 +192,9 @@ def fix_content(final_tex, node_string):
 
 class LatexPaperSplit():
     """
-    将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理
+    Break down a LaTeX document into a linked list;
+    each node carries a preserve flag indicating whether it should
+    be processed by GPT.
""" def __init__(self) -> None: """ @@ -192,11 +204,12 @@ class LatexPaperSplit(): self.msg = "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \ "版权归原文作者所有。翻译内容可靠性无任何保障,请仔细鉴别并以原文为准。" + \ "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者) self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" def merge_result(self, arr, mode, msg): """ - 将GPT处理后的结果融合 + Merge the result after the GPT process completed """ result_string = "" node = self.root @@ -218,7 +231,9 @@ class LatexPaperSplit(): def split(self, txt, project_folder): """ - 将Latex文档分解到一个链表中,每个链表节点用preserve的标志位提示它是否应当被GPT处理 + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. """ text = txt mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM @@ -263,19 +278,7 @@ class LatexPaperSplit(): text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") # text, mask = split_worker_reverse_caption(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) root = convert_to_linklist(text, mask) - # 将分解结果返回 res_to_t - with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: - res_to_t = [] - node = root - while True: - show_html = node.string.replace('\n','
') - if not node.preserve: - res_to_t.append(node.string) - f.write(f'

#{show_html}#

') - else: - f.write(f'

{show_html}

') - node = node.next - if node is None: break + # 修复括号 node = root while True: @@ -340,25 +343,26 @@ class LatexPaperSplit(): node = node.next if node is None: break - # 将分解结果返回 res_to_t with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: - res_to_t = [] + segment_parts_for_gpt = [] node = root while True: show_html = node.string.replace('\n','
                 if not node.preserve:
-                    res_to_t.append(node.string)
+                    segment_parts_for_gpt.append(node.string)
                     f.write(f'<div style="color:red;"><br/>#{show_html}#<br/></div>')
                 else:
                     f.write(f'<div style="color:gray;"><br/>{show_html}<br/></div>')
                 node = node.next
                 if node is None: break
-
         self.root = root
-        self.sp = res_to_t
+        self.sp = segment_parts_for_gpt
         return self.sp
 
 class LatexPaperFileGroup():
+    """
+    Use the tokenizer to break text down according to max_token_limit
+    """
     def __init__(self):
         self.file_paths = []
         self.file_contents = []
@@ -374,7 +378,7 @@
 
     def run_file_split(self, max_token_limit=1900):
         """
-        将长文本分离开来
+        Use the tokenizer to break text down according to max_token_limit
        """
         for index, file_content in enumerate(self.file_contents):
             if self.get_token_num(file_content) < max_token_limit:
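
For reference, a minimal runnable sketch of the PRESERVE/TRANSFORM masking technique the new docstrings describe: a byte mask parallel to the LaTeX source is flipped to PRESERVE over regex matches, and contiguous runs of equal mask values then become the segments. split_worker mirrors the patch; convert_to_segments and the sample text are simplified stand-ins (the real convert_to_linklist builds LinkedListNode objects rather than tuples).

import re
import numpy as np

PRESERVE = 0    # mask value: GPT must leave this span untouched
TRANSFORM = 1   # mask value: GPT may translate this span

def split_worker(text, mask, pattern, flags=0):
    # Mark every match of `pattern` as a preserved area (as in the patch)
    pattern_compile = re.compile(pattern, flags)
    for res in pattern_compile.finditer(text):
        mask[res.span()[0]:res.span()[1]] = PRESERVE
    return text, mask

def convert_to_segments(text, mask):
    # Simplified stand-in for convert_to_linklist: split wherever the
    # mask value changes, keeping each run's preserve flag
    segments, start = [], 0
    for i in range(1, len(text) + 1):
        if i == len(text) or mask[i] != mask[start]:
            segments.append((text[start:i], mask[start] == PRESERVE))
            start = i
    return segments

sample = r"Intro text \begin{equation} E=mc^2 \end{equation} more text"
mask = np.zeros(len(sample), dtype=np.uint8) + TRANSFORM
sample, mask = split_worker(sample, mask,
                            r"\\begin\{equation\}.*?\\end\{equation\}", re.DOTALL)
for string, preserved in convert_to_segments(sample, mask):
    print("PRESERVE" if preserved else "TRANSFORM", "->", repr(string))

Running it prints one TRANSFORM segment for the prose on each side and one PRESERVE segment for the equation environment.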
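In the same spirit, a sketch of the recursive \input{...} inlining that merge_tex_files_ performs. Only the reversed finditer loop and s.group(1) come from the patch; the function name, the file-reading details, and the .tex-extension fallback are illustrative assumptions. Iterating the matches in reverse keeps the spans of earlier matches valid while the string is spliced.

import os
import re

def merge_tex_sketch(project_folder, main_content):
    # Inline every \input{...} by splicing the referenced file's text into
    # the parent; reversed() edits from the end so earlier spans stay valid
    for s in reversed([q for q in re.finditer(r"\\input\{(.*?)\}", main_content, re.M)]):
        fname = s.group(1)
        if not fname.endswith('.tex'):
            fname += '.tex'    # assumption: \input paths may omit the extension
        with open(os.path.join(project_folder, fname), 'r', encoding='utf-8') as fp:
            child = merge_tex_sketch(project_folder, fp.read())    # recurse into children
        main_content = main_content[:s.span()[0]] + child + main_content[s.span()[1]:]
    return main_content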
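Finally, a hedged guess at the token-limited splitting behind "Use the tokenizer to break text down according to max_token_limit": the patch only shows the get_token_num comparison, so the recursive halving and the paragraph-break cut heuristic below are assumptions, not the project's actual run_file_split.

def run_file_split_sketch(texts, get_token_num, max_token_limit=1900):
    # Keep texts under the token limit by recursively halving any that
    # exceed it, preferring to cut at a paragraph break near the middle
    results = []
    for text in texts:
        if get_token_num(text) < max_token_limit:
            results.append(text)
        else:
            mid = len(text) // 2
            cut = text.rfind('\n\n', 0, mid)    # nearest paragraph break before the middle
            if cut <= 0:
                cut = max(1, mid)               # fall back to a hard split
            results.extend(run_file_split_sketch([text[:cut], text[cut:]],
                                                 get_token_num, max_token_limit))
    return results

# e.g. approximate tokens as ~4 characters each:
# pieces = run_file_split_sketch([tex_source], lambda s: len(s) // 4)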