Merge pull request #1255 from xiangsam/master

[Feature] 更新精准翻译PDF文档(NOUGAT)插件
2023-11-11 14:07:22 +08:00 · 2023-11-11 14:07:22 +08:00 · fcf04554c6
commit fcf04554c6
parent 107ea868e1 362b545a45
2 changed files with 24 additions and 15 deletions
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -748,7 +748,7 @@ class nougat_interface():
        yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度：正在加载NOUGAT... （提示：首次运行需要花费较长时间下载NOUGAT参数）", 
                                         chatbot=chatbot, history=history, delay=0)
-        self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
+        self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}" --recompute --no-skipping --markdown --batchsize 8', os.getcwd(), timeout=3600)
        res = glob.glob(os.path.join(dst,'*.mmd'))
        if len(res) == 0:
            self.threadLock.release()
--- a/crazy_functions/批量翻译PDF文档_NOUGAT.py
+++ b/crazy_functions/批量翻译PDF文档_NOUGAT.py
@@ -57,6 +57,12 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
        "批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
    # 清空历史，以免输入溢出
    history = []
    from .crazy_utils import get_files_from_everything
    success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
    if len(file_manifest) > 0:
        # 尝试导入依赖，如果缺少依赖，则给出安装建议
        try:
            import nougat
@@ -67,12 +73,11 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
                             b=f"导入软件依赖失败。使用该模块需要额外依赖，安装方法```pip install --upgrade nougat-ocr tiktoken```。")
            yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
            return
-
+    success_mmd, file_manifest_mmd, _ = get_files_from_everything(txt, type='.mmd')
-    # 清空历史，以免输入溢出
+    success = success or success_mmd
-    history = []
+    file_manifest += file_manifest_mmd
-
+    chatbot.append(["文件列表：", ", ".join([e.split('/')[-1] for e in file_manifest])]); 
-    from .crazy_utils import get_files_from_everything
+    yield from update_ui(      chatbot=chatbot, history=history) 
    success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
    # 检测输入参数，如没有给定输入参数，直接退出
    if not success:
        if txt == "": txt = '空空如也的输入栏'
@@ -101,9 +106,13 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa
    from crazy_functions.pdf_fns.report_gen_html import construct_html
    nougat_handle = nougat_interface()
    for index, fp in enumerate(file_manifest):
        if fp.endswith('pdf'):
            chatbot.append(["当前进度：", f"正在解析论文，请稍候。（第一次运行时，需要花费较长时间下载NOUGAT参数）"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
            fpp = yield from nougat_handle.NOUGAT_parse_pdf(fp, chatbot, history)
            promote_file_to_downloadzone(fpp, rename_file=os.path.basename(fpp)+'.nougat.mmd', chatbot=chatbot)
        else:
            chatbot.append(["当前论文无需解析：", fp]); yield from update_ui(      chatbot=chatbot, history=history)
            fpp = fp
        with open(fpp, 'r', encoding='utf8') as f:
            article_content = f.readlines()
        article_dict = markdown_to_dict(article_content)