introduce doc2x

2024-04-15 01:57:31 +08:00 · 2024-04-15 01:57:31 +08:00 · 160552cc5f
commit 160552cc5f
parent c131ec0b20
7 changed files with 208 additions and 10 deletions
--- a/config.py
+++ b/config.py
@ -223,6 +223,10 @@ MATHPIX_APPID = ""
 MATHPIX_APPKEY = ""


+# DOC2X provides OCR-based PDF parsing; an API key (account registration) is required
+DOC2X_API_KEY = ""
+
+
 # 自定义API KEY格式
 CUSTOM_API_KEY_PATTERN = ""

--- a/crazy_functions/PDF批量翻译.py
+++ b/crazy_functions/PDF批量翻译.py
@ -1,6 +1,7 @@
 from toolbox import CatchException, report_exception, get_log_folder, gen_time_str, check_packages
 from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion
-from toolbox import write_history_to_file, promote_file_to_downloadzone
+from toolbox import write_history_to_file, promote_file_to_downloadzone, get_conf, extract_archive
+from toolbox import get_upload_folder, zip_folder
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 from .crazy_utils import read_and_clean_pdf_text
@ -46,14 +47,123 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
        return

    # 开始正式执行任务
+    DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
+    if len(DOC2X_API_KEY) != 0:
+        yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
+        return
    grobid_url = get_avail_grobid_url()
    if grobid_url is not None:
        yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
+        return
    else:
        yield from update_ui_lastest_msg("GROBID服务不可用，请检查config中的GROBID_URL。作为替代，现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
        yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
+        return


+
+def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
+    """
+    Translate one PDF: convert it to Markdown through the DOC2X web API, then
+    pass the extracted Markdown folder to the Markdown translation plugin.
+
+    This is a generator; it yields whatever `update_ui` and the Markdown
+    plugin yield so the front end keeps refreshing while the work runs.
+
+    Args:
+        fp: path of the PDF file to process.
+        project_folder: not used inside this function.
+        llm_kwargs, history, system_prompt: forwarded unchanged to the Markdown plugin.
+        plugin_kwargs: forwarded to the Markdown plugin; the keys
+            'markdown_expected_output_dir' and 'markdown_expected_output_path'
+            are written into it before the call.
+        chatbot: chat message list; progress messages are appended to it.
+        DOC2X_API_KEY: bearer token sent to the DOC2X endpoints.
+        user_request: forwarded unchanged to the Markdown plugin.
+
+    Raises:
+        RuntimeError: when a DOC2X request returns a non-200 status, or the
+            upload response is empty or contains a line without the "data: " prefix.
+    """
+
+    def pdf2markdown(filepath):
+        """Upload `filepath` to DOC2X, download the Markdown export as a zip, and return the zip path."""
+        import requests, json, os
+        markdown_dir = get_log_folder(plugin_name="pdf_ocr")
+        doc2x_api_key = DOC2X_API_KEY
+        auth_header = {"Authorization": "Bearer " + doc2x_api_key}
+        url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
+
+        chatbot.append((None, "加载PDF文件，发送至DOC2X解析..."))
+        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+
+        # Use a context manager so the PDF handle is closed as soon as the upload returns
+        with open(filepath, "rb") as pdf_file:
+            res = requests.post(
+                url,
+                files={"file": pdf_file},
+                data={"ocr": "1"},
+                headers=auth_header,
+            )
+        if res.status_code != 200:
+            raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+
+        # The body is a sequence of lines, each expected to be "data: <json>"
+        res_json = []
+        decoded = res.content.decode("utf-8")
+        for z_decoded in decoded.split('\n'):
+            if len(z_decoded) == 0: continue
+            # Raise explicitly instead of `assert`, which is stripped under `python -O`
+            if not z_decoded.startswith("data: "):
+                raise RuntimeError("[ERROR] unexpected line in DOC2X response: %s" % z_decoded)
+            res_json.append(json.loads(z_decoded[len("data: "):]))
+        if len(res_json) == 0:
+            # Without this guard the lookup below fails with an uninformative IndexError
+            raise RuntimeError("[ERROR] empty response from DOC2X, body: %s" % res.text)
+        uuid = res_json[0]['uuid']
+        to = "md" # latex, md, docx
+        url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to
+
+        chatbot.append((None, f"读取解析: {url} ..."))
+        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+
+        res = requests.get(url, headers=auth_header)
+        if res.status_code != 200:
+            raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+        md_zip_path = os.path.join(markdown_dir, gen_time_str() + '.zip')
+        with open(md_zip_path, "wb") as f: f.write(res.content)
+        promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
+        chatbot.append((None, f"完成解析 {md_zip_path} ..."))
+        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+        return md_zip_path
+
+    def deliver_to_markdown_plugin(md_zip_path, user_request):
+        """Extract the DOC2X zip, run the Markdown translation plugin on it, then post-process and publish the result."""
+        from crazy_functions.批量Markdown翻译 import Markdown英译中
+        # `os` is imported here because the sibling closure's local import is not visible in this scope
+        import shutil, os
+
+        target_path_base = get_log_folder(chatbot.get_user())
+        file_origin_name = os.path.basename(md_zip_path)
+        this_file_path = os.path.join(target_path_base, file_origin_name)
+        os.makedirs(target_path_base, exist_ok=True)
+        shutil.copyfile(md_zip_path, this_file_path)
+        ex_folder = this_file_path + ".extract"
+        extract_archive(
+            file_path=this_file_path, dest_dir=ex_folder
+        )
+        chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
+        plugin_kwargs['markdown_expected_output_dir'] = ex_folder
+
+        translated_f_name = 'translated_markdown.md'
+        generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name)
+        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+        yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
+        if os.path.exists(generated_fp):
+            # Patch up formula delimiters in the translated file
+            with open(generated_fp, 'r', encoding='utf8') as f:
+                content = f.read()
+            # Replace \[ \] with $$
+            content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
+            # Replace \( \) with $
+            content = content.replace(r'\(', r'$').replace(r'\)', r'$')
+            # Strip code fences. The longer token must go first: replacing '```'
+            # first would leave a stray "markdown" word behind every opening fence.
+            content = content.replace('```markdown', '\n').replace('```', '\n')
+            with open(generated_fp, 'w', encoding='utf8') as f:
+                f.write(content)
+            # Build a zip of the extracted folder (translated markdown plus images)
+            dest_folder = get_log_folder(chatbot.get_user())
+            zip_name = '翻译后的带图文档.zip'
+            zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
+            zip_fp = os.path.join(dest_folder, zip_name)
+            # Generate the online-preview html next to the translated markdown
+            file_name = '在线预览翻译' + gen_time_str() + '.html'
+            with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
+                html_template = f.read()
+            # NOTE(review): the template added in this commit hard-codes "translated_markdown.md"
+            # and has no {MARKDOWN_FILE_PATH} placeholder, so this replace is currently a no-op.
+            html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
+            preview_fp = os.path.join(ex_folder, file_name)
+            with open(preview_fp, 'w', encoding='utf8') as f:
+                f.write(html_template)
+            promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
+            promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
+            yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+    md_zip_path = yield from pdf2markdown(fp)
+    yield from deliver_to_markdown_plugin(md_zip_path, user_request)
+
+def 解析PDF_DOC2X(file_manifest, *args):
+    """
+    Run the single-file DOC2X pipeline on every entry of `file_manifest`, in order.
+
+    `*args` is forwarded unchanged to 解析PDF_DOC2X_单文件 after the file path.
+    This is a generator that re-yields everything the per-file pipeline yields.
+    """
+    for pdf_path in file_manifest:
+        yield from 解析PDF_DOC2X_单文件(pdf_path, *args)
+
 def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url):
    import copy, json
    TOKEN_LIMIT_PER_FRAGMENT = 1024
--- a/crazy_functions/pdf_fns/report_template_v2.html
+++ b/crazy_functions/pdf_fns/report_template_v2.html
@ -0,0 +1,73 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <title>GPT-Academic 翻译报告书</title>
+    <style>
+        .centered-a {
+            color: red;
+            text-align: center;
+            margin-bottom: 2%;
+            font-size: 1.5em;
+        }
+        .centered-b {
+            color: red;
+            text-align: center;
+            margin-top: 10%;
+            margin-bottom: 20%;
+            font-size: 1.5em;
+        }
+        .centered-c {
+            color: rgba(255, 0, 0, 0);
+            text-align: center;
+            margin-top: 2%;
+            margin-bottom: 20%;
+            font-size: 7em;
+        }
+    </style>
+<script>
+        // Configure MathJax settings (must be defined before the MathJax library is loaded)
+        MathJax = {
+            tex: {
+                // Inline-math delimiters: $...$ and \(...\).
+                // The backslashes are doubled because in a JS string literal '\(' is just '(',
+                // which would make every pair of ordinary parentheses a math delimiter.
+                inlineMath: [
+                    ['$', '$'],
+                    ['\\(', '\\)']
+                ]
+            }
+        }
+        // Typeset again once zero-md has rendered the markdown into the page
+        addEventListener('zero-md-rendered', () => {MathJax.typeset(); console.log('MathJax typeset!');})
+    </script>
+    <!-- Load MathJax library -->
+    <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"></script>
+    <script
+        type="module"
+        src="https://cdn.jsdelivr.net/gh/zerodevx/zero-md@2/dist/zero-md.min.js"
+    ></script>
+
+</head>
+
+<body>
+    <div class="test_temp1" style="width:10%; height: 500px; float:left;">
+
+    </div>
+    <div class="test_temp2" style="width:80%; height: 500px; float:left;">
+        <!-- Simply set the `src` attribute to your MD file and win -->
+        <div class="centered-a">
+            请按Ctrl+S保存此页面，否则该页面可能在几分钟后失效。
+        </div>
+        <zero-md src="translated_markdown.md" no-shadow>
+        </zero-md>
+        <div class="centered-b">
+            本报告由GPT-Academic开源项目生成，地址：https://github.com/binary-husky/gpt_academic。
+        </div>
+        <div class="centered-c">
+            本报告由GPT-Academic开源项目生成，地址：https://github.com/binary-husky/gpt_academic。
+        </div>
+    </div>
+    <div class="test_temp3" style="width:10%; height: 500px; float:left;">
+    </div>
+
+    </body>
+
+</html>
--- a/crazy_functions/批量Markdown翻译.py
+++ b/crazy_functions/批量Markdown翻译.py
@ -1,4 +1,4 @@
-import glob, time, os, re, logging
+import glob, shutil, os, re, logging
 from toolbox import update_ui, trimmed_format_exc, gen_time_str, disable_auto_promotion
 from toolbox import CatchException, report_exception, get_log_folder
 from toolbox import write_history_to_file, promote_file_to_downloadzone
@ -69,17 +69,17 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch

    #  <-------- 多线程翻译开始 ---------->
    if language == 'en->zh':
-        inputs_array = ["This is a Markdown file, translate it into Chinese, do not modify any existing Markdown commands:" +
+        inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands:" +
                        f"\n\n{frag}" for frag in pfg.sp_file_contents]
        inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
        sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
    elif language == 'zh->en':
-        inputs_array = [f"This is a Markdown file, translate it into English, do not modify any existing Markdown commands:" +
+        inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands:" +
                        f"\n\n{frag}" for frag in pfg.sp_file_contents]
        inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
        sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
    else:
-        inputs_array = [f"This is a Markdown file, translate it into {language}, do not modify any existing Markdown commands, only answer me with translated results:" +
+        inputs_array = [f"This is a Markdown file, translate it into {language}, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
                        f"\n\n{frag}" for frag in pfg.sp_file_contents]
        inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
        sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
@ -99,7 +99,12 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
        for i_say, gpt_say in zip(gpt_response_collection[0::2], gpt_response_collection[1::2]):
            pfg.sp_file_result.append(gpt_say)
        pfg.merge_result()
-        pfg.write_result(language)
+        output_file_arr = pfg.write_result(language)
+        for output_file in output_file_arr:
+            promote_file_to_downloadzone(output_file, chatbot=chatbot)
+            if 'markdown_expected_output_path' in plugin_kwargs:
+                expected_f_name = plugin_kwargs['markdown_expected_output_path']
+                shutil.copyfile(output_file, expected_f_name)
    except:
        logging.error(trimmed_format_exc())

--- a/shared_utils/text_mask.py
+++ b/shared_utils/text_mask.py
@ -26,6 +26,8 @@ def apply_gpt_academic_string_mask(string, mode="show_all"):
    当字符串中有掩码tag时（<gpt_academic_string_mask><show_...>），根据字符串要给谁看（大模型，还是web渲染），对字符串进行处理，返回处理后的字符串
    示意图：https://mermaid.live/edit#pako:eNqlkUtLw0AUhf9KuOta0iaTplkIPlpduFJwoZEwJGNbzItpita2O6tF8QGKogXFtwu7cSHiq3-mk_oznFR8IYLgrGbuOd9hDrcCpmcR0GDW9ubNPKaBMDauuwI_A9M6YN-3y0bODwxsYos4BdMoBrTg5gwHF-d0mBH6-vqFQe58ed5m9XPW2uteX3Tubrj0ljLYcwxxR3h1zB43WeMs3G19yEM9uapDMe_NG9i2dagKw1Fee4c1D9nGEbtc-5n6HbNtJ8IyHOs8tbs7V2HrlDX2w2Y7XD_5haHEtQiNsOwfMVa_7TzsvrWIuJGo02qTrdwLk9gukQylHv3Afv1ML270s-HZUndrmW1tdA-WfvbM_jMFYuAQ6uCCxVdciTJ1CPLEITpo_GphypeouzXuw6XAmyi7JmgBLZEYlHwLB2S4gHMUO-9DH7tTnvf1CVoFFkBLSOk4QmlRTqpIlaWUHINyNFXjaQWpCYRURUKiWovBYo8X4ymEJFlECQUpqaQkJmuvWygPpg
    """
+    if not string:
+        return string
    if "<gpt_academic_string_mask>" not in string: # No need to process
        return string

--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@ -22,10 +22,12 @@ if __name__ == "__main__":

    # plugin_test(plugin='crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF', main_input="2307.07522")

-    plugin_test(
-        plugin="crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF",
-        main_input="G:/SEAFILE_LOCAL/50503047/我的资料库/学位/paperlatex/aaai/Fu_8368_with_appendix",
-    )
+    plugin_test(plugin='crazy_functions.PDF批量翻译->批量翻译PDF文档', main_input='build/pdf/t1.pdf')
+
+    # plugin_test(
+    #     plugin="crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF",
+    #     main_input="G:/SEAFILE_LOCAL/50503047/我的资料库/学位/paperlatex/aaai/Fu_8368_with_appendix",
+    # )

    # plugin_test(plugin='crazy_functions.虚空终端->虚空终端', main_input='修改api-key为sk-jhoejriotherjep')

--- a/toolbox.py
+++ b/toolbox.py
@ -79,6 +79,8 @@ class ChatBotWithCookies(list):
    def get_cookies(self):
        """Return the cookies object stored on this instance (`self._cookies`)."""
        return self._cookies

+    def get_user(self):
+        """Return the "user_name" entry of the stored cookies, or `default_user_name` when that key is absent."""
+        cookies = self._cookies
+        return cookies.get("user_name", default_user_name)

 def ArgsGeneralWrapper(f):
    """