建立本地arxiv缓存区
This commit is contained in:
parent
d5c9bc9f0a
commit
14269eba98
@ -1,7 +1,8 @@
|
||||
from toolbox import update_ui, trimmed_format_exc, get_conf, objdump, objload
|
||||
from toolbox import update_ui, trimmed_format_exc, get_conf, objdump, objload, promote_file_to_downloadzone
|
||||
from toolbox import CatchException, report_execption, update_ui_lastest_msg, zip_result, gen_time_str
|
||||
import glob, os, requests, time
|
||||
pj = os.path.join
|
||||
ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/")
|
||||
|
||||
# =================================== 工具函数 ===============================================
|
||||
沙雕GPT啊别犯这些低级翻译错误 = 'You must to translate "agent" to "智能体". '
|
||||
@ -48,7 +49,7 @@ def desend_to_extracted_folder_if_exist(project_folder):
|
||||
if maybe_dir[0].endswith('.extract'): return maybe_dir[0]
|
||||
return project_folder
|
||||
|
||||
def move_project(project_folder):
|
||||
def move_project(project_folder, arxiv_id=None):
|
||||
"""
|
||||
Create a new work folder and copy the project folder to it.
|
||||
|
||||
@ -60,13 +61,26 @@ def move_project(project_folder):
|
||||
"""
|
||||
import shutil, time
|
||||
time.sleep(2) # avoid time string conflict
|
||||
new_workfolder = f'gpt_log/{gen_time_str()}'
|
||||
if arxiv_id is not None:
|
||||
new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder')
|
||||
else:
|
||||
new_workfolder = f'gpt_log/{gen_time_str()}'
|
||||
shutil.copytree(src=project_folder, dst=new_workfolder)
|
||||
return new_workfolder
|
||||
|
||||
def arxiv_download(chatbot, history, txt):
|
||||
def check_cached_translation_pdf(arxiv_id):
|
||||
translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation')
|
||||
if not os.path.exists(translation_dir):
|
||||
os.makedirs(translation_dir)
|
||||
target_file = pj(translation_dir, 'translate_zh.pdf')
|
||||
if os.path.exists(target_file):
|
||||
promote_file_to_downloadzone(target_file)
|
||||
return target_file
|
||||
return False
|
||||
|
||||
if not txt.startswith('https://arxiv.org'):
|
||||
return txt
|
||||
return txt, None
|
||||
|
||||
# <-------------- inspect format ------------->
|
||||
chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...'])
|
||||
@ -77,16 +91,19 @@ def arxiv_download(chatbot, history, txt):
|
||||
if not txt.startswith('https://arxiv.org/abs/'):
|
||||
msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}"
|
||||
yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面
|
||||
return msg
|
||||
|
||||
return msg, None
|
||||
# <-------------- set format ------------->
|
||||
arxiv_id = url_.split('/abs/')[-1]
|
||||
cached_translation_pdf = check_cached_translation_pdf(arxiv_id)
|
||||
if cached_translation_pdf: return cached_translation_pdf, arxiv_id
|
||||
|
||||
url_tar = url_.replace('/abs/', '/e-print/')
|
||||
download_dir = './gpt_log/arxiv/'
|
||||
os.makedirs(download_dir, exist_ok=True)
|
||||
translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print')
|
||||
extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract')
|
||||
os.makedirs(translation_dir, exist_ok=True)
|
||||
|
||||
# <-------------- download arxiv source file ------------->
|
||||
dst = pj(download_dir, arxiv_id+'.tar')
|
||||
dst = pj(translation_dir, arxiv_id+'.tar')
|
||||
if os.path.exists(dst):
|
||||
yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面
|
||||
else:
|
||||
@ -98,9 +115,8 @@ def arxiv_download(chatbot, history, txt):
|
||||
# <-------------- extract file ------------->
|
||||
yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面
|
||||
from toolbox import extract_archive
|
||||
extract_dst = f'gpt_log/{gen_time_str()}'
|
||||
extract_archive(file_path=dst, dest_dir=extract_dst)
|
||||
return extract_dst
|
||||
return extract_dst, arxiv_id
|
||||
# ========================================= 插件主程序1 =====================================================
|
||||
|
||||
|
||||
@ -126,7 +142,6 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo
|
||||
|
||||
# <-------------- clear history and read input ------------->
|
||||
history = []
|
||||
txt = yield from arxiv_download(chatbot, history, txt)
|
||||
if os.path.exists(txt):
|
||||
project_folder = txt
|
||||
else:
|
||||
@ -146,7 +161,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo
|
||||
|
||||
|
||||
# <-------------- move latex project away from temp folder ------------->
|
||||
project_folder = move_project(project_folder)
|
||||
project_folder = move_project(project_folder, arxiv_id=None)
|
||||
|
||||
|
||||
# <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
|
||||
@ -197,7 +212,11 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
|
||||
|
||||
# <-------------- clear history and read input ------------->
|
||||
history = []
|
||||
txt = yield from arxiv_download(chatbot, history, txt)
|
||||
txt, arxiv_id = yield from arxiv_download(chatbot, history, txt)
|
||||
if txt.endswith('.pdf'):
|
||||
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档")
|
||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||
return
|
||||
if os.path.exists(txt):
|
||||
project_folder = txt
|
||||
else:
|
||||
@ -217,7 +236,7 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
|
||||
|
||||
|
||||
# <-------------- move latex project away from temp folder ------------->
|
||||
project_folder = move_project(project_folder)
|
||||
project_folder = move_project(project_folder, arxiv_id)
|
||||
|
||||
|
||||
# <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
|
||||
|
@ -190,7 +190,8 @@ def test_Latex():
|
||||
txt = r"C:\Users\fuqingxu\Desktop\旧文件\gpt\latex2pdf\2023-06-03-17-14-40"
|
||||
txt = r"https://arxiv.org/abs/2305.18290"
|
||||
txt = r"https://arxiv.org/abs/2305.17608"
|
||||
|
||||
# txt = r"https://arxiv.org/abs/2306.00324"
|
||||
txt = r"https://arxiv.org/abs/2211.16068"
|
||||
|
||||
for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
|
||||
cli_printer.print(cb) # print(cb)
|
||||
|
@ -1,5 +1,5 @@
|
||||
from toolbox import update_ui, update_ui_lastest_msg # 刷新Gradio前端界面
|
||||
from toolbox import zip_folder, objdump, objload
|
||||
from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone
|
||||
import os, shutil
|
||||
import re
|
||||
pj = os.path.join
|
||||
@ -135,10 +135,10 @@ class LatexPaperSplit():
|
||||
match = pattern.search(result_string)
|
||||
position = match.end()
|
||||
result_string = result_string[:position] + \
|
||||
"\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成,其内容可靠性没有任何保障,请仔细鉴别并以原文为准。" + \
|
||||
"{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成,其内容可靠性没有任何保障,请仔细鉴别并以原文为准。" + \
|
||||
"项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + \
|
||||
msg + \
|
||||
"为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}\\\\" + \
|
||||
"为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" + \
|
||||
result_string[position:]
|
||||
except:
|
||||
pass
|
||||
@ -232,6 +232,8 @@ class LatexPaperSplit():
|
||||
split_worker(root, r"\\iffalse(.*?)\\fi", re.DOTALL)
|
||||
# 吸收在25行以内的begin-end组合
|
||||
split_worker_begin_end(root, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25)
|
||||
# 吸收匿名公式
|
||||
split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL)
|
||||
# 吸收其他杂项
|
||||
split_worker(root, r"(.*?)\\maketitle", re.DOTALL)
|
||||
split_worker(root, r"\\section\{(.*?)\}")
|
||||
@ -257,7 +259,6 @@ class LatexPaperSplit():
|
||||
split_worker(root, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
|
||||
split_worker(root, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
|
||||
split_worker(root, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
|
||||
split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL)
|
||||
split_worker(root, r"\\item ")
|
||||
split_worker(root, r"\\label\{(.*?)\}")
|
||||
split_worker(root, r"\\begin\{(.*?)\}")
|
||||
@ -398,12 +399,6 @@ class LatexPaperFileGroup():
|
||||
manifest.append(path + '.polish.tex')
|
||||
f.write(res)
|
||||
return manifest
|
||||
|
||||
def zip_result(self):
|
||||
import os, time
|
||||
folder = os.path.dirname(self.file_paths[0])
|
||||
t = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
|
||||
zip_folder(folder, './gpt_log/', f'{t}-polished.zip')
|
||||
|
||||
|
||||
|
||||
@ -525,13 +520,13 @@ def compile_latex_with_timeout(command, timeout=60):
|
||||
stdout, stderr = process.communicate()
|
||||
print("Process timed out!")
|
||||
return False
|
||||
print(stderr)
|
||||
return True
|
||||
|
||||
def 编译Latex差别(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder):
|
||||
import os, time
|
||||
current_dir = os.getcwd()
|
||||
n_fix = 0
|
||||
n_fix = 1
|
||||
max_try = 32
|
||||
chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,则大概率是卡死在Latex里面了。不幸卡死时请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
|
||||
chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
|
||||
yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面
|
||||
@ -539,29 +534,31 @@ def 编译Latex差别(chatbot, history, main_file_original, main_file_modified,
|
||||
while True:
|
||||
import os
|
||||
# https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
|
||||
yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||
os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir)
|
||||
|
||||
yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||
os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir)
|
||||
|
||||
if ok:
|
||||
if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
|
||||
# 只有第二步成功,才能继续下面的步骤
|
||||
yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面
|
||||
os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux'); os.chdir(current_dir)
|
||||
os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux'); os.chdir(current_dir)
|
||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面
|
||||
if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')):
|
||||
os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux'); os.chdir(current_dir)
|
||||
if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')):
|
||||
os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux'); os.chdir(current_dir)
|
||||
|
||||
yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
|
||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
|
||||
os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir)
|
||||
os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir)
|
||||
os.chdir(work_folder_original); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex'); os.chdir(current_dir)
|
||||
os.chdir(work_folder_modified); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex'); os.chdir(current_dir)
|
||||
|
||||
yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
|
||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
|
||||
print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex')
|
||||
ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex')
|
||||
|
||||
yield from update_ui_lastest_msg(f'尝试第{n_fix}次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||
os.chdir(work_folder); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex'); os.chdir(current_dir)
|
||||
os.chdir(work_folder); ok = compile_latex_with_timeout(f'bibtex merge_diff.aux'); os.chdir(current_dir)
|
||||
os.chdir(work_folder); ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex'); os.chdir(current_dir)
|
||||
@ -583,9 +580,13 @@ def 编译Latex差别(chatbot, history, main_file_original, main_file_modified,
|
||||
if modified_pdf_success:
|
||||
yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 即将退出 ...', chatbot, history) # 刷新Gradio前端界面
|
||||
os.chdir(current_dir)
|
||||
result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf')
|
||||
if os.path.exists(pj(work_folder, '..', 'translation')):
|
||||
shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf'))
|
||||
promote_file_to_downloadzone(result_pdf)
|
||||
return True # 成功啦
|
||||
else:
|
||||
if n_fix>=7: break
|
||||
if n_fix>=max_try: break
|
||||
n_fix += 1
|
||||
can_retry, main_file_modified, buggy_lines = remove_buggy_lines(
|
||||
file_path=pj(work_folder_modified, f'{main_file_modified}.tex'),
|
||||
|
@ -431,6 +431,13 @@ def find_recent_files(directory):
|
||||
|
||||
return recent_files
|
||||
|
||||
def promote_file_to_downloadzone(file, rename_file=None):
|
||||
# 将文件复制一份到下载区
|
||||
import shutil
|
||||
if rename_file is None: rename_file = f'{gen_time_str()}-{os.path.basename(file)}'
|
||||
new_path = os.path.join(f'./gpt_log/', rename_file)
|
||||
if os.path.exists(new_path): os.remove(new_path)
|
||||
shutil.copyfile(file, new_path)
|
||||
|
||||
def on_file_uploaded(files, chatbot, txt, txt2, checkboxes):
|
||||
"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user