diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py
index ecba82b..855cc1c 100644
--- a/crazy_functions/Latex输出PDF结果.py
+++ b/crazy_functions/Latex输出PDF结果.py
@@ -82,7 +82,14 @@ def arxiv_download(chatbot, history, txt):
             promote_file_to_downloadzone(target_file)
             return target_file
         return False
-
+    def is_float(s):
+        try:
+            float(s)
+            return True
+        except ValueError:
+            return False
+    if ('.' in txt) and ('/' not in txt) and is_float(txt):
+        txt = 'https://arxiv.org/abs/' + txt
     if not txt.startswith('https://arxiv.org'):
         return txt, None

@@ -198,7 +205,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
     # <-------------- information about this plugin ------------->
     chatbot.append([ "函数插件功能?",
-        "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"])
+        "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

@@ -221,6 +228,8 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
         report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档")
         yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
         return
+
+
     if os.path.exists(txt):
         project_folder = txt
     else:
@@ -228,6 +237,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
         report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
         yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
         return
+
     file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
     if len(file_manifest) == 0:
         report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
@@ -261,5 +271,6 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
         chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
         yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
+
     # <-------------- we are done ------------->
     return success
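Review note: the new `is_float` guard lets users paste a bare arXiv ID such as `2002.09253` instead of a full URL. One gap worth knowing: version-suffixed IDs like `1706.03762v5` do not parse as floats, so they fall through unchanged. A standalone sketch of the same normalization (the wrapper name and assertions are illustrative, not from the patch):

```python
def normalize_arxiv_input(txt: str) -> str:
    """If txt looks like a bare arXiv ID such as '2002.09253', expand it to a full URL."""
    def is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False
    # An ID like '2002.09253' contains a dot, no slash, and parses as a float.
    if ('.' in txt) and ('/' not in txt) and is_float(txt):
        return 'https://arxiv.org/abs/' + txt
    return txt

assert normalize_arxiv_input('2002.09253') == 'https://arxiv.org/abs/2002.09253'
assert normalize_arxiv_input('https://arxiv.org/abs/2002.09253').startswith('https://arxiv.org')
assert normalize_arxiv_input('1706.03762v5') == '1706.03762v5'  # version suffix is missed
```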
diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py
index d19d653..e743878 100644
--- a/crazy_functions/crazy_functions_test.py
+++ b/crazy_functions/crazy_functions_test.py
@@ -182,13 +182,13 @@ def test_Langchain知识库读取():
 def test_Latex():
     from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF

-    txt = r"https://arxiv.org/abs/1706.03762"
+    # txt = r"https://arxiv.org/abs/1706.03762"
     # txt = r"https://arxiv.org/abs/1902.03185"
     # txt = r"https://arxiv.org/abs/2305.18290"
     # txt = r"https://arxiv.org/abs/2305.17608"
-    # txt = r"https://arxiv.org/abs/2211.16068" # ACE
-    # txt = r"C:\Users\fuqingxu\arxiv_cache\2211.16068\workfolder" # ACE
-
+    # txt = r"https://arxiv.org/abs/2211.16068" # ACE
+    # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE
+    txt = r"https://arxiv.org/abs/2002.09253"
     for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
         cli_printer.print(cb)
         # print(cb)
diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py
index b490b5c..15dfebc 100644
--- a/crazy_functions/latex_utils.py
+++ b/crazy_functions/latex_utils.py
@@ -61,8 +61,8 @@ class LinkedListNode():
         self.string = string
         self.preserve = preserve
         self.next = None
-        self.begin_line = 0
-        self.begin_char = 0
+        # self.begin_line = 0
+        # self.begin_char = 0

 def convert_to_linklist(text, mask):
     root = LinkedListNode("", preserve=True)
@@ -97,11 +97,22 @@ def 寻找Latex主文件(file_manifest, mode):
         else:
             continue
     raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)')
-
+def rm_comments(main_file):
+    new_file_remove_comment_lines = []
+    for l in main_file.splitlines():
+        # 删除整行的空注释
+        if l.startswith("%") or (l.startswith(" ") and l.lstrip().startswith("%")):
+            pass
+        else:
+            new_file_remove_comment_lines.append(l)
+    main_file = '\n'.join(new_file_remove_comment_lines)
+    main_file = re.sub(r'(?<!\\)%.*', '', main_file)  # 使用正则表达式查找半行注释, 并替换为空字符串
+    return main_file
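Review note: `rm_comments` pairs a whole-line filter with `re.sub(r'(?<!\\)%.*', '', ...)`; the negative lookbehind keeps escaped `\%` (a literal percent sign in LaTeX) while stripping true half-line comments. A quick standalone check of that pattern (the test string is illustrative):

```python
import re

tex = r"escaped 50\% stays % but this trailing comment goes"
# (?<!\\)% matches a '%' only when it is NOT preceded by a backslash,
# so the literal '\%' survives and the real comment is cut.
print(re.sub(r'(?<!\\)%.*', '', tex))
# -> 'escaped 50\% stays '
```

One edge the single-character lookbehind cannot see: after `\\` (a LaTeX line break) a following `%` does start a comment, but the regex finds a preceding backslash and keeps it.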
@@ -189,8 +200,5 @@ class LatexPaperSplit():
     def __init__(self) -> None:
-        """
-        root是链表的根节点
-        """
-        self.root = None
+        self.nodes = None
         self.msg = "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \
             "版权归原文作者所有。翻译内容可靠性无任何保障,请仔细鉴别并以原文为准。" + \
             "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。"
@@ -212,16 +212,13 @@ class LatexPaperSplit():
         Merge the result after the GPT process completed
         """
         result_string = ""
-        node = self.root
         p = 0
-        while True:
+        for node in self.nodes:
             if node.preserve:
                 result_string += node.string
             else:
                 result_string += fix_content(arr[p], node.string)
                 p += 1
-            node = node.next
-            if node is None: break
         if mode == 'translate_zh':
             pattern = re.compile(r'\\begin\{abstract\}.*\n')
             match = pattern.search(result_string)
@@ -229,7 +226,27 @@ class LatexPaperSplit():
             result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
         return result_string

-    def split(self, txt, project_folder):
+    def split(self, txt, project_folder):
+        """
+        break down latex file to a linked list,
+        each node use a preserve flag to indicate whether it should
+        be proccessed by GPT.
+        P.S. use multiprocessing to avoid timeout error
+        """
+        import multiprocessing
+        manager = multiprocessing.Manager()
+        return_dict = manager.dict()
+        p = multiprocessing.Process(
+            target=lambda lps, txt, project_folder, return_dict:
+                lps.split_subprocess(txt, project_folder, return_dict),
+            args=(self, txt, project_folder, return_dict))
+        p.start()
+        p.join()
+        self.nodes = return_dict['nodes']
+        self.sp = return_dict['segment_parts_for_gpt']
+        return self.sp
+
+    def split_subprocess(self, txt, project_folder, return_dict):
         """
         break down latex file to a linked list,
         each node use a preserve flag to indicate whether it should
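Review note: running `split_subprocess` in a child process isolates the caller from pathological inputs, per the new docstring ("use multiprocessing to avoid timeout error"). Two caveats worth flagging: a lambda `target` cannot be pickled under the Windows `spawn` start method (fork-based Linux is fine), and `p.join()` without a timeout still blocks forever if the child hangs. A minimal sketch of the same pattern with an explicit timeout (the function and names are illustrative, not from the patch):

```python
import multiprocessing

def do_split(txt, return_dict):
    # stand-in for the real splitting work; a module-level function pickles on Windows too
    return_dict['result'] = txt.split()

def split_with_timeout(txt, timeout=60):
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(target=do_split, args=(txt, return_dict))
    p.start()
    p.join(timeout=timeout)   # give up instead of blocking forever
    if p.is_alive():
        p.terminate()         # kill the stuck worker
        p.join()
        raise TimeoutError("split did not finish in time")
    return return_dict['result']

if __name__ == '__main__':
    print(split_with_timeout("a b c"))
```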
@@ -318,12 +335,20 @@ class LatexPaperSplit():
             node = node.next
             if node is None: break

+        # 屏蔽空行和太短的句子
         node = root
         while True:
             if len(node.string.strip('\n').strip(''))==0: node.preserve = True
             if len(node.string.strip('\n').strip(''))<42: node.preserve = True
             node = node.next
             if node is None: break
+        node = root
+        while True:
+            if node.next and node.preserve and node.next.preserve:
+                node.string += node.next.string
+                node.next = node.next.next
+            node = node.next
+            if node is None: break

         # 将前后断行符脱离
         node = root
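Review note: the added second pass merges adjacent `preserve` nodes so protected text stays contiguous. Because the cursor steps forward after every merge, one pass collapses each pair but leaves a run of three or more preserve nodes only partially merged; if fully collapsing runs is the intent, staying on the node after a merge does it. A sketch of that variant (the `Node` class is reduced to the two fields the loop touches):

```python
class Node:
    def __init__(self, string, preserve):
        self.string, self.preserve, self.next = string, preserve, None

def merge_preserve_runs(root):
    node = root
    while node is not None:
        # keep absorbing the successor while both halves are preserved
        while node.next and node.preserve and node.next.preserve:
            node.string += node.next.string
            node.next = node.next.next
        node = node.next
```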
@@ -345,8 +370,10 @@ class LatexPaperSplit():
         with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
             segment_parts_for_gpt = []
+            nodes = []
             node = root
             while True:
+                nodes.append(node)
                 show_html = node.string.replace('\n','<br/>')
                 if not node.preserve:
                     segment_parts_for_gpt.append(node.string)
@@ -355,9 +382,11 @@ class LatexPaperSplit():
                     f.write(f'<div>{show_html}</div>')
                 node = node.next
                 if node is None: break
-        self.root = root
-        self.sp = segment_parts_for_gpt
-        return self.sp
+
+        for n in nodes: n.next = None # break
+        return_dict['nodes'] = nodes
+        return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt
+        return return_dict

 class LatexPaperFileGroup():
     """
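Review note: the closing `for n in nodes: n.next = None # break` is presumably there to make the nodes picklable: they return to the parent through a `Manager().dict()`, which pickles them, and pickling a long singly linked list recurses once per `next` hop, overflowing the default recursion limit on real papers. Flattening to a list and cutting the links sidesteps that. A small demonstration of the failure mode (this `Node` class is a stripped-down stand-in for `LinkedListNode`; sizes are illustrative):

```python
import pickle

class Node:
    def __init__(self, string):
        self.string = string
        self.next = None

# build a chain far deeper than the default recursion limit (~1000)
root = Node("0")
cur = root
for i in range(1, 5000):
    cur.next = Node(str(i))
    cur = cur.next

try:
    pickle.dumps(root)        # pickling walks the next-chain recursively
except RecursionError:
    print("deep linked list: RecursionError")

# flatten and cut the links, as the patch does; pickling is then shallow
nodes = []
cur = root
while cur is not None:
    nodes.append(cur)
    cur = cur.next
for n in nodes:
    n.next = None
pickle.dumps(nodes)
print("flat list with links cut: ok")
```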
@@ -439,7 +468,7 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
     # <-------- 精细切分latex文件 ---------->
     lps = LatexPaperSplit()
-    res = lps.split(merged_content, project_folder)
+    res = lps.split(merged_content, project_folder) # 消耗时间的函数

     # <-------- 拆分过长的latex片段 ---------->
     pfg = LatexPaperFileGroup()
@@ -515,7 +544,8 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
             f.writelines(file_lines)
         return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines
     except:
-        return False, 0, [0]
+        print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
+        return False, -1, [-1]


 def compile_latex_with_timeout(command, timeout=60):
diff --git a/docs/Dockerfile+NoLocal+Latex b/docs/Dockerfile+NoLocal+Latex
index 428dbc0..0f9ac8a 100644
--- a/docs/Dockerfile+NoLocal+Latex
+++ b/docs/Dockerfile+NoLocal+Latex
@@ -8,26 +8,17 @@ FROM fuqingxu/python311_texlive_ctex:latest

 # 指定路径
 WORKDIR /gpt

+ARG useProxyNetwork=''
+
+RUN $useProxyNetwork pip3 install gradio openai numpy arxiv rich -i https://pypi.douban.com/simple/
+RUN $useProxyNetwork pip3 install colorama Markdown pygments pymupdf -i https://pypi.douban.com/simple/
+
 # 装载项目文件
 COPY . .

-ARG useProxyNetwork=''
-
-
-# # # comment out below if you do not need proxy network | 如果不需要翻墙 - 从此行向下删除
-# RUN apt-get update
-# RUN apt-get install -y curl proxychains
-# RUN $useProxyNetwork curl cip.cc
-# RUN sed -i '$ d' /etc/proxychains.conf
-# RUN sed -i '$ d' /etc/proxychains.conf
-# RUN echo "socks5 127.0.0.1 10880" >> /etc/proxychains.conf
-# ARG useProxyNetwork=proxychains
-# # # comment out above if you do not need proxy network | 如果不需要翻墙 - 从此行向上删除
-
-
 # 安装依赖
-RUN $useProxyNetwork pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
+RUN $useProxyNetwork pip3 install -r requirements.txt -i https://pypi.douban.com/simple/

 # 可选步骤,用于预热模块
 RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
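Review note: moving the two `pip3 install` layers above `COPY . .` is the standard Docker build-cache optimization: editing source code no longer invalidates the dependency layers, so rebuilds skip reinstalling those packages. Note that `pip3 install -r requirements.txt` still runs after `COPY . .` here; copying only `requirements.txt` before the full source would extend the same caching to that step too. The general shape, as a sketch (base image and paths are illustrative, not from the patch):

```dockerfile
# any base image; this tag is illustrative
FROM python:3.11
WORKDIR /app
# dependency layer first: cached until the package list changes
COPY requirements.txt .
RUN pip3 install -r requirements.txt
# source last: editing code only invalidates layers from here down
COPY . .
```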