Merge pull request #1255 from xiangsam/master

[Feature] 更新精准翻译PDF文档(NOUGAT)插件
This commit is contained in:
binary-husky 2023-11-11 14:07:22 +08:00 committed by GitHub
commit fcf04554c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 15 deletions

View File

@ -748,7 +748,7 @@ class nougat_interface():
yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度正在加载NOUGAT... 提示首次运行需要花费较长时间下载NOUGAT参数",
chatbot=chatbot, history=history, delay=0)
self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}" --recompute --no-skipping --markdown --batchsize 8', os.getcwd(), timeout=3600)
res = glob.glob(os.path.join(dst,'*.mmd'))
if len(res) == 0:
self.threadLock.release()

View File

@ -57,6 +57,12 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
"批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 清空历史,以免输入溢出
history = []
from .crazy_utils import get_files_from_everything
success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
if len(file_manifest) > 0:
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
import nougat
@ -67,12 +73,11 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade nougat-ocr tiktoken```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 清空历史,以免输入溢出
history = []
from .crazy_utils import get_files_from_everything
success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
success_mmd, file_manifest_mmd, _ = get_files_from_everything(txt, type='.mmd')
success = success or success_mmd
file_manifest += file_manifest_mmd
chatbot.append(["文件列表:", ", ".join([e.split('/')[-1] for e in file_manifest])]);
yield from update_ui( chatbot=chatbot, history=history)
# 检测输入参数,如没有给定输入参数,直接退出
if not success:
if txt == "": txt = '空空如也的输入栏'
@ -101,9 +106,13 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa
from crazy_functions.pdf_fns.report_gen_html import construct_html
nougat_handle = nougat_interface()
for index, fp in enumerate(file_manifest):
if fp.endswith('pdf'):
chatbot.append(["当前进度:", f"正在解析论文请稍候。第一次运行时需要花费较长时间下载NOUGAT参数"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
fpp = yield from nougat_handle.NOUGAT_parse_pdf(fp, chatbot, history)
promote_file_to_downloadzone(fpp, rename_file=os.path.basename(fpp)+'.nougat.mmd', chatbot=chatbot)
else:
chatbot.append(["当前论文无需解析:", fp]); yield from update_ui( chatbot=chatbot, history=history)
fpp = fp
with open(fpp, 'r', encoding='utf8') as f:
article_content = f.readlines()
article_dict = markdown_to_dict(article_content)