From 40da1b0afefd42bb27255e6980ac9b3fd43d7654 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Tue, 6 Jun 2023 18:44:00 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E5=B0=86Latex=E5=88=86=E8=A7=A3=E7=A8=8B?= =?UTF-8?q?=E5=BA=8F=E6=94=BE=E5=88=B0=E5=AD=90=E8=BF=9B=E7=A8=8B=E6=89=A7?= =?UTF-8?q?=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/Latex输出PDF结果.py | 15 ++++- crazy_functions/crazy_functions_test.py | 8 +-- crazy_functions/latex_utils.py | 84 +++++++++++++++++-------- docs/Dockerfile+NoLocal+Latex | 21 ++----- 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py index ecba82b..855cc1c 100644 --- a/crazy_functions/Latex输出PDF结果.py +++ b/crazy_functions/Latex输出PDF结果.py @@ -82,7 +82,14 @@ def arxiv_download(chatbot, history, txt): promote_file_to_downloadzone(target_file) return target_file return False - + def is_float(s): + try: + float(s) + return True + except ValueError: + return False + if ('.' in txt) and ('/' not in txt) and is_float(txt): + txt = 'https://arxiv.org/abs/' + txt if not txt.startswith('https://arxiv.org'): return txt, None @@ -198,7 +205,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- information about this plugin -------------> chatbot.append([ "函数插件功能?", - "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"]) + "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 @@ -221,6 +228,8 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档") yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return + + if os.path.exists(txt): project_folder = txt else: @@ -228,6 +237,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] if len(file_manifest) == 0: report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") @@ -261,5 +271,6 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...')) yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 + # <-------------- we are done -------------> return success diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py index d19d653..e743878 100644 --- a/crazy_functions/crazy_functions_test.py +++ b/crazy_functions/crazy_functions_test.py @@ -182,13 +182,13 @@ def test_Langchain知识库读取(): def test_Latex(): from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF - txt = r"https://arxiv.org/abs/1706.03762" + # txt = r"https://arxiv.org/abs/1706.03762" # txt = r"https://arxiv.org/abs/1902.03185" # txt = r"https://arxiv.org/abs/2305.18290" # txt = r"https://arxiv.org/abs/2305.17608" - # txt = r"https://arxiv.org/abs/2211.16068" # ACE - # txt = r"C:\Users\fuqingxu\arxiv_cache\2211.16068\workfolder" # ACE - + # txt = r"https://arxiv.org/abs/2211.16068" # ACE + # txt = 
r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE + txt = r"https://arxiv.org/abs/2002.09253" for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): cli_printer.print(cb) # print(cb) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index b490b5c..15dfebc 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -61,8 +61,8 @@ class LinkedListNode(): self.string = string self.preserve = preserve self.next = None - self.begin_line = 0 - self.begin_char = 0 + # self.begin_line = 0 + # self.begin_char = 0 def convert_to_linklist(text, mask): root = LinkedListNode("", preserve=True) @@ -97,11 +97,22 @@ def 寻找Latex主文件(file_manifest, mode): else: continue raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') - +def rm_comments(main_file): + new_file_remove_comment_lines = [] + for l in main_file.splitlines(): + # 删除整行的空注释 + if l.startswith("%") or (l.startswith(" ") and l.lstrip().startswith("%")): + pass + else: + new_file_remove_comment_lines.append(l) + main_file = '\n'.join(new_file_remove_comment_lines) + main_file = re.sub(r'(? None: - """ - root是链表的根节点 - """ - self.root = None + self.nodes = None self.msg = "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \ "版权归原文作者所有。翻译内容可靠性无任何保障,请仔细鉴别并以原文为准。" + \ "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" @@ -212,16 +212,13 @@ class LatexPaperSplit(): Merge the result after the GPT process completed """ result_string = "" - node = self.root p = 0 - while True: + for node in self.nodes: if node.preserve: result_string += node.string else: result_string += fix_content(arr[p], node.string) p += 1 - node = node.next - if node is None: break if mode == 'translate_zh': pattern = re.compile(r'\\begin\{abstract\}.*\n') match = pattern.search(result_string) @@ -229,7 +226,27 @@ class LatexPaperSplit(): result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] return result_string - def split(self, txt, project_folder): + def split(self, txt, project_folder): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. + P.S. 
use multiprocessing to avoid timeout error + """ + import multiprocessing + manager = multiprocessing.Manager() + return_dict = manager.dict() + p = multiprocessing.Process( + target=lambda lps, txt, project_folder, return_dict: + lps.split_subprocess(txt, project_folder, return_dict), + args=(self, txt, project_folder, return_dict)) + p.start() + p.join() + self.nodes = return_dict['nodes'] + self.sp = return_dict['segment_parts_for_gpt'] + return self.sp + + def split_subprocess(self, txt, project_folder, return_dict): """ break down latex file to a linked list, each node use a preserve flag to indicate whether it should @@ -318,12 +335,20 @@ class LatexPaperSplit(): node = node.next if node is None: break + # 屏蔽空行和太短的句子 node = root while True: if len(node.string.strip('\n').strip(''))==0: node.preserve = True if len(node.string.strip('\n').strip(''))<42: node.preserve = True node = node.next if node is None: break + node = root + while True: + if node.next and node.preserve and node.next.preserve: + node.string += node.next.string + node.next = node.next.next + node = node.next + if node is None: break # 将前后断行符脱离 node = root @@ -345,8 +370,10 @@ class LatexPaperSplit(): with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: segment_parts_for_gpt = [] + nodes = [] node = root while True: + nodes.append(node) show_html = node.string.replace('\n','
<br/>')
                 if not node.preserve:
                     segment_parts_for_gpt.append(node.string)
@@ -355,9 +382,11 @@ class LatexPaperSplit():
                     f.write(f'<p style="color:gray;">{show_html}</p>
') node = node.next if node is None: break - self.root = root - self.sp = segment_parts_for_gpt - return self.sp + + for n in nodes: n.next = None # break + return_dict['nodes'] = nodes + return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt + return return_dict class LatexPaperFileGroup(): """ @@ -439,7 +468,7 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin # <-------- 精细切分latex文件 ----------> lps = LatexPaperSplit() - res = lps.split(merged_content, project_folder) + res = lps.split(merged_content, project_folder) # 消耗时间的函数 # <-------- 拆分过长的latex片段 ----------> pfg = LatexPaperFileGroup() @@ -515,7 +544,8 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work f.writelines(file_lines) return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines except: - return False, 0, [0] + print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") + return False, -1, [-1] def compile_latex_with_timeout(command, timeout=60): diff --git a/docs/Dockerfile+NoLocal+Latex b/docs/Dockerfile+NoLocal+Latex index 428dbc0..0f9ac8a 100644 --- a/docs/Dockerfile+NoLocal+Latex +++ b/docs/Dockerfile+NoLocal+Latex @@ -8,26 +8,17 @@ FROM fuqingxu/python311_texlive_ctex:latest # 指定路径 WORKDIR /gpt +ARG useProxyNetwork='' + +RUN $useProxyNetwork pip3 install gradio openai numpy arxiv rich -i https://pypi.douban.com/simple/ +RUN $useProxyNetwork pip3 install colorama Markdown pygments pymupdf -i https://pypi.douban.com/simple/ + # 装载项目文件 COPY . . -ARG useProxyNetwork='' - - -# # # comment out below if you do not need proxy network | 如果不需要翻墙 - 从此行向下删除 -# RUN apt-get update -# RUN apt-get install -y curl proxychains -# RUN $useProxyNetwork curl cip.cc -# RUN sed -i '$ d' /etc/proxychains.conf -# RUN sed -i '$ d' /etc/proxychains.conf -# RUN echo "socks5 127.0.0.1 10880" >> /etc/proxychains.conf -# ARG useProxyNetwork=proxychains -# # # comment out above if you do not need proxy network | 如果不需要翻墙 - 从此行向上删除 - - # 安装依赖 -RUN $useProxyNetwork pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple +RUN $useProxyNetwork pip3 install -r requirements.txt -i https://pypi.douban.com/simple/ # 可选步骤,用于预热模块 RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()' From 8ef734410160f2b8090a2ec10b15069ee60da9b7 Mon Sep 17 00:00:00 2001 From: qingxu fu <505030475@qq.com> Date: Tue, 6 Jun 2023 18:57:52 +0800 Subject: [PATCH 2/2] fix subprocess bug in Windows --- crazy_functions/latex_utils.py | 288 +++++++++++++++++---------------- 1 file changed, 145 insertions(+), 143 deletions(-) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 15dfebc..d3d7b9c 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -192,6 +192,149 @@ def fix_content(final_tex, node_string): final_tex = node_string # 出问题了,还原原文 return final_tex +def split_subprocess(txt, project_folder, return_dict): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. 
+ """ + text = txt + mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM + + # 吸收title与作者以上的部分 + text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL) + # 删除iffalse注释 + text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) + # 吸收在25行以内的begin-end组合 + text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) + # 吸收匿名公式 + text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) + # 吸收其他杂项 + text, mask = split_worker(text, mask, r"\\section\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) + text, mask = split_worker(text, mask, r"\\item ") + text, mask = split_worker(text, mask, r"\\label\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}") + text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") + # text, mask = split_worker_reverse_caption(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) + root = convert_to_linklist(text, mask) + + # 修复括号 + node = root + while True: + string = node.string + if node.preserve: + node = node.next + if node is None: break + continue + def break_check(string): + str_stack = [""] # (lv, index) + for i, c in enumerate(string): + if c == '{': + str_stack.append('{') + elif c == '}': + if len(str_stack) == 1: + print('stack fix') + return i + str_stack.pop(-1) + else: + str_stack[-1] += c + return -1 + bp = break_check(string) + + if bp == -1: + pass + elif bp == 0: + node.string = string[:1] + q 
= LinkedListNode(string[1:], False) + q.next = node.next + node.next = q + else: + node.string = string[:bp] + q = LinkedListNode(string[bp:], False) + q.next = node.next + node.next = q + + node = node.next + if node is None: break + + # 屏蔽空行和太短的句子 + node = root + while True: + if len(node.string.strip('\n').strip(''))==0: node.preserve = True + if len(node.string.strip('\n').strip(''))<42: node.preserve = True + node = node.next + if node is None: break + node = root + while True: + if node.next and node.preserve and node.next.preserve: + node.string += node.next.string + node.next = node.next.next + node = node.next + if node is None: break + + # 将前后断行符脱离 + node = root + prev_node = None + while True: + if not node.preserve: + lstriped_ = node.string.lstrip().lstrip('\n') + if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): + prev_node.string += node.string[:-len(lstriped_)] + node.string = lstriped_ + rstriped_ = node.string.rstrip().rstrip('\n') + if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): + node.next.string = node.string[len(rstriped_):] + node.next.string + node.string = rstriped_ + # ===== + prev_node = node + node = node.next + if node is None: break + + with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: + segment_parts_for_gpt = [] + nodes = [] + node = root + while True: + nodes.append(node) + show_html = node.string.replace('\n','
<br/>')
+            if not node.preserve:
+                segment_parts_for_gpt.append(node.string)
+                f.write(f'<p style="color:red;">#{show_html}#</p>')
+            else:
+                f.write(f'<p style="color:gray;">{show_html}</p>
') + node = node.next + if node is None: break + + for n in nodes: n.next = None # break + return_dict['nodes'] = nodes + return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt + return return_dict + + class LatexPaperSplit(): """ @@ -237,156 +380,15 @@ class LatexPaperSplit(): manager = multiprocessing.Manager() return_dict = manager.dict() p = multiprocessing.Process( - target=lambda lps, txt, project_folder, return_dict: - lps.split_subprocess(txt, project_folder, return_dict), - args=(self, txt, project_folder, return_dict)) + target=split_subprocess, + args=(txt, project_folder, return_dict)) p.start() p.join() self.nodes = return_dict['nodes'] self.sp = return_dict['segment_parts_for_gpt'] return self.sp - def split_subprocess(self, txt, project_folder, return_dict): - """ - break down latex file to a linked list, - each node use a preserve flag to indicate whether it should - be proccessed by GPT. - """ - text = txt - mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM - # 吸收title与作者以上的部分 - text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL) - # 删除iffalse注释 - text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) - # 吸收在25行以内的begin-end组合 - text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) - # 吸收匿名公式 - text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL) - # 吸收其他杂项 - text, mask = split_worker(text, mask, r"\\section\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) - text, mask = split_worker(text, mask, r"\\item ") - text, mask = 
split_worker(text, mask, r"\\label\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}") - text, mask = split_worker(text, mask, r"\\end\{(.*?)\}") - # text, mask = split_worker_reverse_caption(text, mask, r"\\caption\{(.*?)\}", re.DOTALL) - root = convert_to_linklist(text, mask) - - # 修复括号 - node = root - while True: - string = node.string - if node.preserve: - node = node.next - if node is None: break - continue - def break_check(string): - str_stack = [""] # (lv, index) - for i, c in enumerate(string): - if c == '{': - str_stack.append('{') - elif c == '}': - if len(str_stack) == 1: - print('stack fix') - return i - str_stack.pop(-1) - else: - str_stack[-1] += c - return -1 - bp = break_check(string) - - if bp == -1: - pass - elif bp == 0: - node.string = string[:1] - q = LinkedListNode(string[1:], False) - q.next = node.next - node.next = q - else: - node.string = string[:bp] - q = LinkedListNode(string[bp:], False) - q.next = node.next - node.next = q - - node = node.next - if node is None: break - - # 屏蔽空行和太短的句子 - node = root - while True: - if len(node.string.strip('\n').strip(''))==0: node.preserve = True - if len(node.string.strip('\n').strip(''))<42: node.preserve = True - node = node.next - if node is None: break - node = root - while True: - if node.next and node.preserve and node.next.preserve: - node.string += node.next.string - node.next = node.next.next - node = node.next - if node is None: break - - # 将前后断行符脱离 - node = root - prev_node = None - while True: - if not node.preserve: - lstriped_ = node.string.lstrip().lstrip('\n') - if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): - prev_node.string += node.string[:-len(lstriped_)] - node.string = lstriped_ - rstriped_ = node.string.rstrip().rstrip('\n') - if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): - node.next.string = node.string[len(rstriped_):] + node.next.string - node.string = rstriped_ - # ===== - prev_node = node - node = node.next - if node is None: break - - with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: - segment_parts_for_gpt = [] - nodes = [] - node = root - while True: - nodes.append(node) - show_html = node.string.replace('\n','
<br/>')
-                if not node.preserve:
-                    segment_parts_for_gpt.append(node.string)
-                    f.write(f'<p style="color:red;">#{show_html}#</p>')
-                else:
-                    f.write(f'<p style="color:gray;">{show_html}</p>')
-                node = node.next
-                if node is None: break
-
-        for n in nodes: n.next = None # break
-        return_dict['nodes'] = nodes
-        return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt
-        return return_dict
 
 
 class LatexPaperFileGroup():
     """
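
The change across these two patches boils down to one pattern: offload the time-consuming LaTeX splitting to a child process, pass its results back through a multiprocessing.Manager dict, and use a picklable module-level function as the Process target so the code also works under Windows, where child processes are started with "spawn" rather than "fork". The sketch below is a minimal, self-contained illustration of that pattern only; heavy_split and run_split are hypothetical names and are not code from this repository.

# Minimal sketch (not project code) of the subprocess pattern introduced above.
import multiprocessing

def heavy_split(txt, return_dict):
    # Child-process work: anything slow or crash-prone goes here.
    lines = txt.split('\n')
    return_dict['nodes'] = lines
    return_dict['segment_parts_for_gpt'] = [l for l in lines if l.strip()]

def run_split(txt):
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    # On Windows the child is spawned, so the Process target must be picklable.
    # A module-level function qualifies; the lambda used before PATCH 2/2 does
    # not, which is why the second commit moves split_subprocess out of the class.
    p = multiprocessing.Process(target=heavy_split, args=(txt, return_dict))
    p.start()
    p.join()
    return return_dict['nodes'], return_dict['segment_parts_for_gpt']

if __name__ == '__main__':  # guard required on Windows to avoid re-spawning
    nodes, parts = run_split("first line\n\nsecond line")
    print(nodes, parts)

Running the splitter in a separate process, as the first commit's docstring notes, also keeps a long-running or hung split from triggering timeouts in the main worker, at the cost of shuttling the results back through the manager dict.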