From 14269eba98112f192455d931316c2f3bb7d95608 Mon Sep 17 00:00:00 2001 From: qingxu fu <505030475@qq.com> Date: Sun, 4 Jun 2023 16:08:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BB=BA=E7=AB=8B=E6=9C=AC=E5=9C=B0arxiv?= =?UTF-8?q?=E7=BC=93=E5=AD=98=E5=8C=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/Latex输出PDF结果.py | 49 +++++++++++++++++-------- crazy_functions/crazy_functions_test.py | 3 +- crazy_functions/latex_utils.py | 45 ++++++++++++----------- toolbox.py | 7 ++++ 4 files changed, 66 insertions(+), 38 deletions(-) diff --git a/crazy_functions/Latex输出PDF结果.py b/crazy_functions/Latex输出PDF结果.py index 7d2e796..087cab8 100644 --- a/crazy_functions/Latex输出PDF结果.py +++ b/crazy_functions/Latex输出PDF结果.py @@ -1,7 +1,8 @@ -from toolbox import update_ui, trimmed_format_exc, get_conf, objdump, objload +from toolbox import update_ui, trimmed_format_exc, get_conf, objdump, objload, promote_file_to_downloadzone from toolbox import CatchException, report_execption, update_ui_lastest_msg, zip_result, gen_time_str import glob, os, requests, time pj = os.path.join +ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/") # =================================== 工具函数 =============================================== 沙雕GPT啊别犯这些低级翻译错误 = 'You must to translate "agent" to "智能体". ' @@ -48,7 +49,7 @@ def desend_to_extracted_folder_if_exist(project_folder): if maybe_dir[0].endswith('.extract'): return maybe_dir[0] return project_folder -def move_project(project_folder): +def move_project(project_folder, arxiv_id=None): """ Create a new work folder and copy the project folder to it. @@ -60,13 +61,26 @@ def move_project(project_folder): """ import shutil, time time.sleep(2) # avoid time string conflict - new_workfolder = f'gpt_log/{gen_time_str()}' + if arxiv_id is not None: + new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder') + else: + new_workfolder = f'gpt_log/{gen_time_str()}' shutil.copytree(src=project_folder, dst=new_workfolder) return new_workfolder def arxiv_download(chatbot, history, txt): + def check_cached_translation_pdf(arxiv_id): + translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation') + if not os.path.exists(translation_dir): + os.makedirs(translation_dir) + target_file = pj(translation_dir, 'translate_zh.pdf') + if os.path.exists(target_file): + promote_file_to_downloadzone(target_file) + return target_file + return False + if not txt.startswith('https://arxiv.org'): - return txt + return txt, None # <-------------- inspect format -------------> chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...']) @@ -77,16 +91,19 @@ def arxiv_download(chatbot, history, txt): if not txt.startswith('https://arxiv.org/abs/'): msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}" yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面 - return msg - + return msg, None # <-------------- set format -------------> arxiv_id = url_.split('/abs/')[-1] + cached_translation_pdf = check_cached_translation_pdf(arxiv_id) + if cached_translation_pdf: return cached_translation_pdf, arxiv_id + url_tar = url_.replace('/abs/', '/e-print/') - download_dir = './gpt_log/arxiv/' - os.makedirs(download_dir, exist_ok=True) + translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print') + extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract') + os.makedirs(translation_dir, exist_ok=True) # <-------------- download arxiv source file -------------> - dst = pj(download_dir, arxiv_id+'.tar') + dst = pj(translation_dir, arxiv_id+'.tar') if os.path.exists(dst): yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面 else: @@ -98,9 +115,8 @@ def arxiv_download(chatbot, history, txt): # <-------------- extract file -------------> yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面 from toolbox import extract_archive - extract_dst = f'gpt_log/{gen_time_str()}' extract_archive(file_path=dst, dest_dir=extract_dst) - return extract_dst + return extract_dst, arxiv_id # ========================================= 插件主程序1 ===================================================== @@ -126,7 +142,6 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo # <-------------- clear history and read input -------------> history = [] - txt = yield from arxiv_download(chatbot, history, txt) if os.path.exists(txt): project_folder = txt else: @@ -146,7 +161,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo # <-------------- move latex project away from temp folder -------------> - project_folder = move_project(project_folder) + project_folder = move_project(project_folder, arxiv_id=None) # <-------------- if merge_translate_zh is already generated, skip gpt req -------------> @@ -197,7 +212,11 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- clear history and read input -------------> history = [] - txt = yield from arxiv_download(chatbot, history, txt) + txt, arxiv_id = yield from arxiv_download(chatbot, history, txt) + if txt.endswith('.pdf'): + report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return if os.path.exists(txt): project_folder = txt else: @@ -217,7 +236,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- move latex project away from temp folder -------------> - project_folder = move_project(project_folder) + project_folder = move_project(project_folder, arxiv_id) # <-------------- if merge_translate_zh is already generated, skip gpt req -------------> diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py index 4852948..d4e3274 100644 --- a/crazy_functions/crazy_functions_test.py +++ b/crazy_functions/crazy_functions_test.py @@ -190,7 +190,8 @@ def test_Latex(): txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-17-14-40" txt = r"https://arxiv.org/abs/2305.18290" txt = r"https://arxiv.org/abs/2305.17608" - + # txt = r"https://arxiv.org/abs/2306.00324" + txt = r"https://arxiv.org/abs/2211.16068" for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): cli_printer.print(cb) # print(cb) diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py index 3faf6b2..3e128eb 100644 --- a/crazy_functions/latex_utils.py +++ b/crazy_functions/latex_utils.py @@ -1,5 +1,5 @@ from toolbox import update_ui, update_ui_lastest_msg # 刷新Gradio前端界面 -from toolbox import zip_folder, objdump, objload +from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone import os, shutil import re pj = os.path.join @@ -135,10 +135,10 @@ class LatexPaperSplit(): match = pattern.search(result_string) position = match.end() result_string = result_string[:position] + \ - "\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成,其内容可靠性没有任何保障,请仔细鉴别并以原文为准。" + \ + "{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成,其内容可靠性没有任何保障,请仔细鉴别并以原文为准。" + \ "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + \ msg + \ - "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}\\\\" + \ + "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" + \ result_string[position:] except: pass @@ -232,6 +232,8 @@ class LatexPaperSplit(): split_worker(root, r"\\iffalse(.*?)\\fi", re.DOTALL) # 吸收在25行以内的begin-end组合 split_worker_begin_end(root, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25) + # 吸收匿名公式 + split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL) # 吸收其他杂项 split_worker(root, r"(.*?)\\maketitle", re.DOTALL) split_worker(root, r"\\section\{(.*?)\}") @@ -257,7 +259,6 @@ class LatexPaperSplit(): split_worker(root, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL) split_worker(root, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL) split_worker(root, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) - split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL) split_worker(root, r"\\item ") split_worker(root, r"\\label\{(.*?)\}") split_worker(root, r"\\begin\{(.*?)\}") @@ -398,12 +399,6 @@ class LatexPaperFileGroup(): manifest.append(path + '.polish.tex') f.write(res) return manifest - - def zip_result(self): - import os, time - folder = os.path.dirname(self.file_paths[0]) - t = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) - zip_folder(folder, './gpt_log/', f'{t}-polished.zip') @@ -525,13 +520,13 @@ def compile_latex_with_timeout(command, timeout=60): stdout, stderr = process.communicate() print("Process timed out!") return False - print(stderr) return True def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder): import os, time current_dir = os.getcwd() - n_fix = 0 + n_fix = 1 + max_try = 32 chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,则大概率是卡死在Latex里面了。不幸卡死时请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 @@ -539,29 +534,31 @@ def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, while True: import os # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error - yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir) - yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir) - if ok: + if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): # 只有第二步成功,才能继续下面的步骤 - yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面 - os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux'); os.chdir(current_dir) - os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux'); os.chdir(current_dir) + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面 + if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')): + os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux'); os.chdir(current_dir) + if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')): + os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux'); os.chdir(current_dir) - yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir) os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir) os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir) os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir) - yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') - yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 os.chdir(work_folder); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex'); os.chdir(current_dir) os.chdir(work_folder); ok = compile_latex_with_timeout(f'bibtex merge_diff.aux'); os.chdir(current_dir) os.chdir(work_folder); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex'); os.chdir(current_dir) @@ -583,9 +580,13 @@ def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, if modified_pdf_success: yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 即将退出 ...', chatbot, history) # 刷新Gradio前端界面 os.chdir(current_dir) + result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') + if os.path.exists(pj(work_folder, '..', 'translation')): + shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf')) + promote_file_to_downloadzone(result_pdf) return True # 成功啦 else: - if n_fix>=7: break + if n_fix>=max_try: break n_fix += 1 can_retry, main_file_modified, buggy_lines = remove_buggy_lines( file_path=pj(work_folder_modified, f'{main_file_modified}.tex'), diff --git a/toolbox.py b/toolbox.py index b4bcf82..18915d0 100644 --- a/toolbox.py +++ b/toolbox.py @@ -431,6 +431,13 @@ def find_recent_files(directory): return recent_files +def promote_file_to_downloadzone(file, rename_file=None): + # 将文件复制一份到下载区 + import shutil + if rename_file is None: rename_file = f'{gen_time_str()}-{os.path.basename(file)}' + new_path = os.path.join(f'./gpt_log/', rename_file) + if os.path.exists(new_path): os.remove(new_path) + shutil.copyfile(file, new_path) def on_file_uploaded(files, chatbot, txt, txt2, checkboxes): """