From 33bf795c663587ec86d5a852a14a7560b1af09a5 Mon Sep 17 00:00:00 2001 From: xiangsam Date: Fri, 10 Nov 2023 11:45:47 +0000 Subject: [PATCH 1/2] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E7=B2=BE=E5=87=86?= =?UTF-8?q?=E7=BF=BB=E8=AF=91PDF=E6=96=87=E6=A1=A3(NOUGAT)=E6=8F=92?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/crazy_utils.py | 2 +- crazy_functions/批量翻译PDF文档_NOUGAT.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index a23c732..ce7a2e3 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -748,7 +748,7 @@ class nougat_interface(): yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... (提示:首次运行需要花费较长时间下载NOUGAT参数)", chatbot=chatbot, history=history, delay=0) - self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600) + self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}" --recompute --no-skipping --markdown --batchsize 8', os.getcwd(), timeout=3600) res = glob.glob(os.path.join(dst,'*.mmd')) if len(res) == 0: self.threadLock.release() diff --git a/crazy_functions/批量翻译PDF文档_NOUGAT.py b/crazy_functions/批量翻译PDF文档_NOUGAT.py index 3e50c93..50e34c4 100644 --- a/crazy_functions/批量翻译PDF文档_NOUGAT.py +++ b/crazy_functions/批量翻译PDF文档_NOUGAT.py @@ -73,6 +73,11 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst from .crazy_utils import get_files_from_everything success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf') + success_mmd, file_manifest_mmd, _ = get_files_from_everything(txt, type='.mmd') + success = success or success_mmd + file_manifest += file_manifest_mmd + chatbot.append(["文件列表:", ", ".join([e.split('/')[-1] for e in file_manifest])]); + yield from update_ui( chatbot=chatbot, history=history) # 检测输入参数,如没有给定输入参数,直接退出 if not success: if txt == "": txt = '空空如也的输入栏' @@ -101,9 +106,13 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa from crazy_functions.pdf_fns.report_gen_html import construct_html nougat_handle = nougat_interface() for index, fp in enumerate(file_manifest): - chatbot.append(["当前进度:", f"正在解析论文,请稍候。(第一次运行时,需要花费较长时间下载NOUGAT参数)"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - fpp = yield from nougat_handle.NOUGAT_parse_pdf(fp, chatbot, history) - promote_file_to_downloadzone(fpp, rename_file=os.path.basename(fpp)+'.nougat.mmd', chatbot=chatbot) + if fp.endswith('pdf'): + chatbot.append(["当前进度:", f"正在解析论文,请稍候。(第一次运行时,需要花费较长时间下载NOUGAT参数)"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + fpp = yield from nougat_handle.NOUGAT_parse_pdf(fp, chatbot, history) + promote_file_to_downloadzone(fpp, rename_file=os.path.basename(fpp)+'.nougat.mmd', chatbot=chatbot) + else: + chatbot.append(["当前论文无需解析:", fp]); yield from update_ui( chatbot=chatbot, history=history) + fpp = fp with open(fpp, 'r', encoding='utf8') as f: article_content = f.readlines() article_dict = markdown_to_dict(article_content) From 362b545a45352b011adef023e54f9c34a8110fdf Mon Sep 17 00:00:00 2001 From: xiangsam Date: Fri, 10 Nov 2023 14:25:37 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=E6=9B=B4=E6=94=B9import=20nougat=E6=97=B6?= =?UTF-8?q?=E6=9C=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/批量翻译PDF文档_NOUGAT.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crazy_functions/批量翻译PDF文档_NOUGAT.py b/crazy_functions/批量翻译PDF文档_NOUGAT.py index 50e34c4..16dfd6b 100644 --- a/crazy_functions/批量翻译PDF文档_NOUGAT.py +++ b/crazy_functions/批量翻译PDF文档_NOUGAT.py @@ -57,22 +57,22 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst "批量翻译PDF文档。函数插件贡献者: Binary-Husky"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - import nougat - import tiktoken - except: - report_execption(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade nougat-ocr tiktoken```。") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - # 清空历史,以免输入溢出 history = [] from .crazy_utils import get_files_from_everything success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf') + if len(file_manifest) > 0: + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import nougat + import tiktoken + except: + report_execption(chatbot, history, + a=f"解析项目: {txt}", + b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade nougat-ocr tiktoken```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return success_mmd, file_manifest_mmd, _ = get_files_from_everything(txt, type='.mmd') success = success or success_mmd file_manifest += file_manifest_mmd