From 160552cc5fcc16413db3dfa3176b3fb5ab613c5a Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 15 Apr 2024 01:57:31 +0800 Subject: [PATCH] introduce doc2x --- config.py | 4 + crazy_functions/PDF批量翻译.py | 112 +++++++++++++++++- .../pdf_fns/report_template_v2.html | 73 ++++++++++++ crazy_functions/批量Markdown翻译.py | 15 ++- shared_utils/text_mask.py | 2 + tests/test_plugins.py | 10 +- toolbox.py | 2 + 7 files changed, 208 insertions(+), 10 deletions(-) create mode 100644 crazy_functions/pdf_fns/report_template_v2.html diff --git a/config.py b/config.py index 380a64b..44788bc 100644 --- a/config.py +++ b/config.py @@ -223,6 +223,10 @@ MATHPIX_APPID = "" MATHPIX_APPKEY = "" +# DOC2X 拥有执行PDF的OCR功能,但是需要注册账号 +DOC2X_API_KEY = "" + + # 自定义API KEY格式 CUSTOM_API_KEY_PATTERN = "" diff --git a/crazy_functions/PDF批量翻译.py b/crazy_functions/PDF批量翻译.py index 7d6ad4f..630d595 100644 --- a/crazy_functions/PDF批量翻译.py +++ b/crazy_functions/PDF批量翻译.py @@ -1,6 +1,7 @@ from toolbox import CatchException, report_exception, get_log_folder, gen_time_str, check_packages from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion -from toolbox import write_history_to_file, promote_file_to_downloadzone +from toolbox import write_history_to_file, promote_file_to_downloadzone, get_conf, extract_archive +from toolbox import get_upload_folder, zip_folder from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency from .crazy_utils import read_and_clean_pdf_text @@ -46,14 +47,123 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst return # 开始正式执行任务 + DOC2X_API_KEY = get_conf("DOC2X_API_KEY") + if len(DOC2X_API_KEY) != 0: + yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request) + return grobid_url = get_avail_grobid_url() if grobid_url 
is not None: yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url) + return else: yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3) yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + return + +def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): + + def pdf2markdown(filepath): + import requests, json, os + markdown_dir = get_log_folder(plugin_name="pdf_ocr") + doc2x_api_key = DOC2X_API_KEY + url = "https://api.doc2x.noedgeai.com/api/v1/pdf" + + chatbot.append((None, "加载PDF文件,发送至DOC2X解析...")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + res = requests.post( + url, + files={"file": open(filepath, "rb")}, + data={"ocr": "1"}, + headers={"Authorization": "Bearer " + doc2x_api_key} + ) + res_json = [] + if res.status_code == 200: + decoded = res.content.decode("utf-8") + for z_decoded in decoded.split('\n'): + if len(z_decoded) == 0: continue + assert z_decoded.startswith("data: ") + z_decoded = z_decoded[len("data: "):] + decoded_json = json.loads(z_decoded) + res_json.append(decoded_json) + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + uuid = res_json[0]['uuid'] + to = "md" # latex, md, docx + url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to + + chatbot.append((None, f"读取解析: {url} ...")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key}) + md_zip_path = os.path.join(markdown_dir, gen_time_str() + '.zip') + if res.status_code == 200: + with open(md_zip_path, "wb") as f: f.write(res.content) + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) 
+ promote_file_to_downloadzone(md_zip_path, chatbot=chatbot) + chatbot.append((None, f"完成解析 {md_zip_path} ...")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return md_zip_path + + def deliver_to_markdown_plugin(md_zip_path, user_request): + from crazy_functions.批量Markdown翻译 import Markdown英译中 + import shutil + + time_tag = gen_time_str() + target_path_base = get_log_folder(chatbot.get_user()) + file_origin_name = os.path.basename(md_zip_path) + this_file_path = os.path.join(target_path_base, file_origin_name) + os.makedirs(target_path_base, exist_ok=True) + shutil.copyfile(md_zip_path, this_file_path) + ex_folder = this_file_path + ".extract" + extract_archive( + file_path=this_file_path, dest_dir=ex_folder + ) + chatbot.append((None, f"调用Markdown插件 {ex_folder} ...")) + plugin_kwargs['markdown_expected_output_dir'] = ex_folder + + translated_f_name = 'translated_markdown.md' + generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) + if os.path.exists(generated_fp): + # 修正一些公式问题 + with open(generated_fp, 'r', encoding='utf8') as f: + content = f.read() + # 将公式中的\[ \]替换成$$ + content = content.replace(r'\[', r'$$').replace(r'\]', r'$$') + # 将公式中的\( \)替换成$ + content = content.replace(r'\(', r'$').replace(r'\)', r'$') + content = content.replace('```', '\n').replace('```markdown', '\n') + with open(generated_fp, 'w', encoding='utf8') as f: + f.write(content) + # 生成包含图片的压缩包 + dest_folder = get_log_folder(chatbot.get_user()) + zip_name = '翻译后的带图文档.zip' + zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name) + zip_fp = os.path.join(dest_folder, zip_name) + # 生成在线预览html + file_name = '在线预览翻译' + gen_time_str() + '.html' + with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as 
f: + html_template = f.read() + html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name) + preview_fp = os.path.join(ex_folder, file_name) + with open(preview_fp, 'w', encoding='utf8') as f: + f.write(html_template) + promote_file_to_downloadzone(preview_fp, chatbot=chatbot) + promote_file_to_downloadzone(zip_fp, chatbot=chatbot) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + md_zip_path = yield from pdf2markdown(fp) + yield from deliver_to_markdown_plugin(md_zip_path, user_request) + +def 解析PDF_DOC2X(file_manifest, *args): + for index, fp in enumerate(file_manifest): + yield from 解析PDF_DOC2X_单文件(fp, *args) + return + def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url): import copy, json TOKEN_LIMIT_PER_FRAGMENT = 1024 diff --git a/crazy_functions/pdf_fns/report_template_v2.html b/crazy_functions/pdf_fns/report_template_v2.html new file mode 100644 index 0000000..a18c9ec --- /dev/null +++ b/crazy_functions/pdf_fns/report_template_v2.html @@ -0,0 +1,73 @@ + + + + + + GPT-Academic 翻译报告书 + + + + + + + + + +
+ +
+
+ +
+ 请按Ctrl+S保存此页面,否则该页面可能在几分钟后失效。 +
+ + +
+ 本报告由GPT-Academic开源项目生成,地址:https://github.com/binary-husky/gpt_academic。 +
+
+ 本报告由GPT-Academic开源项目生成,地址:https://github.com/binary-husky/gpt_academic。 +
+
+
+
+ + + + \ No newline at end of file diff --git a/crazy_functions/批量Markdown翻译.py b/crazy_functions/批量Markdown翻译.py index 7b87589..9a0be94 100644 --- a/crazy_functions/批量Markdown翻译.py +++ b/crazy_functions/批量Markdown翻译.py @@ -1,4 +1,4 @@ -import glob, time, os, re, logging +import glob, shutil, os, re, logging from toolbox import update_ui, trimmed_format_exc, gen_time_str, disable_auto_promotion from toolbox import CatchException, report_exception, get_log_folder from toolbox import write_history_to_file, promote_file_to_downloadzone @@ -69,17 +69,17 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch # <-------- 多线程翻译开始 ----------> if language == 'en->zh': - inputs_array = ["This is a Markdown file, translate it into Chinese, do not modify any existing Markdown commands:" + + inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands:" + f"\n\n{frag}" for frag in pfg.sp_file_contents] inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)] elif language == 'zh->en': - inputs_array = [f"This is a Markdown file, translate it into English, do not modify any existing Markdown commands:" + + inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands:" + f"\n\n{frag}" for frag in pfg.sp_file_contents] inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] sys_prompt_array = ["You are a professional academic paper translator." 
for _ in range(n_split)] else: - inputs_array = [f"This is a Markdown file, translate it into {language}, do not modify any existing Markdown commands, only answer me with translated results:" + + inputs_array = [f"This is a Markdown file, translate it into {language}, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" + f"\n\n{frag}" for frag in pfg.sp_file_contents] inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)] @@ -99,7 +99,12 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch for i_say, gpt_say in zip(gpt_response_collection[0::2], gpt_response_collection[1::2]): pfg.sp_file_result.append(gpt_say) pfg.merge_result() - pfg.write_result(language) + output_file_arr = pfg.write_result(language) + for output_file in output_file_arr: + promote_file_to_downloadzone(output_file, chatbot=chatbot) + if 'markdown_expected_output_path' in plugin_kwargs: + expected_f_name = plugin_kwargs['markdown_expected_output_path'] + shutil.copyfile(output_file, expected_f_name) except: logging.error(trimmed_format_exc()) diff --git a/shared_utils/text_mask.py b/shared_utils/text_mask.py index 4ecb130..2645ac8 100644 --- a/shared_utils/text_mask.py +++ b/shared_utils/text_mask.py @@ -26,6 +26,8 @@ def apply_gpt_academic_string_mask(string, mode="show_all"): 当字符串中有掩码tag时(),根据字符串要给谁看(大模型,还是web渲染),对字符串进行处理,返回处理后的字符串 
示意图:https://mermaid.live/edit#pako:eNqlkUtLw0AUhf9KuOta0iaTplkIPlpduFJwoZEwJGNbzItpita2O6tF8QGKogXFtwu7cSHiq3-mk_oznFR8IYLgrGbuOd9hDrcCpmcR0GDW9ubNPKaBMDauuwI_A9M6YN-3y0bODwxsYos4BdMoBrTg5gwHF-d0mBH6-vqFQe58ed5m9XPW2uteX3Tubrj0ljLYcwxxR3h1zB43WeMs3G19yEM9uapDMe_NG9i2dagKw1Fee4c1D9nGEbtc-5n6HbNtJ8IyHOs8tbs7V2HrlDX2w2Y7XD_5haHEtQiNsOwfMVa_7TzsvrWIuJGo02qTrdwLk9gukQylHv3Afv1ML270s-HZUndrmW1tdA-WfvbM_jMFYuAQ6uCCxVdciTJ1CPLEITpo_GphypeouzXuw6XAmyi7JmgBLZEYlHwLB2S4gHMUO-9DH7tTnvf1CVoFFkBLSOk4QmlRTqpIlaWUHINyNFXjaQWpCYRURUKiWovBYo8X4ymEJFlECQUpqaQkJmuvWygPpg """ + if not string: + return string if "" not in string: # No need to process return string diff --git a/tests/test_plugins.py b/tests/test_plugins.py index d18ee0a..527c29d 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -22,10 +22,12 @@ if __name__ == "__main__": # plugin_test(plugin='crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF', main_input="2307.07522") - plugin_test( - plugin="crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF", - main_input="G:/SEAFILE_LOCAL/50503047/我的资料库/学位/paperlatex/aaai/Fu_8368_with_appendix", - ) + plugin_test(plugin='crazy_functions.PDF批量翻译->批量翻译PDF文档', main_input='build/pdf/t1.pdf') + + # plugin_test( + # plugin="crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF", + # main_input="G:/SEAFILE_LOCAL/50503047/我的资料库/学位/paperlatex/aaai/Fu_8368_with_appendix", + # ) # plugin_test(plugin='crazy_functions.虚空终端->虚空终端', main_input='修改api-key为sk-jhoejriotherjep') diff --git a/toolbox.py b/toolbox.py index b25984d..6cc62e7 100644 --- a/toolbox.py +++ b/toolbox.py @@ -79,6 +79,8 @@ class ChatBotWithCookies(list): def get_cookies(self): return self._cookies + def get_user(self): + return self._cookies.get("user_name", default_user_name) def ArgsGeneralWrapper(f): """