fix equation showing problem

2024-04-22 01:54:03 +08:00 · 2024-04-22 01:54:03 +08:00 · 6ad15a6129
commit 6ad15a6129
parent 09990d44d3
4 changed files with 96 additions and 39 deletions
--- a/crazy_functions/PDF批量翻译.py
+++ b/crazy_functions/PDF批量翻译.py
@ -5,6 +5,7 @@ from toolbox import get_upload_folder, zip_folder
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 from .crazy_utils import read_and_clean_pdf_text
+from .crazy_utils import get_files_from_everything
 from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf
 from colorful import *
 import os
@ -15,9 +16,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst

    disable_auto_promotion(chatbot)
    # 基本信息：功能、贡献者
-    chatbot.append([
-        "函数插件功能？",
-        "批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
+    chatbot.append([None, "插件功能：批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

    # 尝试导入依赖，如果缺少依赖，则给出安装建议
@ -33,7 +32,6 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
    # 清空历史，以免输入溢出
    history = []

-    from .crazy_utils import get_files_from_everything
    success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
    # 检测输入参数，如没有给定输入参数，直接退出
    if not success:
@ -48,17 +46,25 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst

    # 开始正式执行任务
    DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
+    # ------- 第一种方法，效果最好，但是需要DOC2X服务 -------
    if len(DOC2X_API_KEY) != 0:
-        yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
-        return
+        try:
+            yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
+            return
+        except:
+            chatbot.append([None, "DOC2X服务不可用，现在将执行效果稍差的旧版代码。"])
+            yield from update_ui(chatbot=chatbot, history=history)
+
+    # ------- 第二种方法，效果次优 -------
    grobid_url = get_avail_grobid_url()
    if grobid_url is not None:
        yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
        return
-    else:
-        yield from update_ui_lastest_msg("GROBID服务不可用，请检查config中的GROBID_URL。作为替代，现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
-        yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
-        return
+
+    # ------- 第三种方法，早期代码，效果不理想 -------
+    yield from update_ui_lastest_msg("GROBID服务不可用，请检查config中的GROBID_URL。作为替代，现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
+    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
+    return



@ -110,7 +116,7 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha

    def deliver_to_markdown_plugin(md_zip_path, user_request):
        from crazy_functions.批量Markdown翻译 import Markdown英译中
-        import shutil
+        import shutil, re

        time_tag = gen_time_str()
        target_path_base = get_log_folder(chatbot.get_user())
@ -122,6 +128,23 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
        extract_archive(
            file_path=this_file_path, dest_dir=ex_folder
        )
+
+        # edit markdown files
+        success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md')
+        for generated_fp in file_manifest:
+            # 修正一些公式问题
+            with open(generated_fp, 'r', encoding='utf8') as f:
+                content = f.read()
+            # 将公式中的\[ \]替换成$$
+            content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
+            # 将公式中的\( \)替换成$
+            content = content.replace(r'\(', r'$').replace(r'\)', r'$')
+            content = content.replace('```markdown', '\n').replace('```', '\n')
+            with open(generated_fp, 'w', encoding='utf8') as f:
+                f.write(content)
+            promote_file_to_downloadzone(generated_fp, chatbot=chatbot)
+            yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
        chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
        plugin_kwargs['markdown_expected_output_dir'] = ex_folder

@ -131,29 +154,30 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
        yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
        if os.path.exists(generated_fp):
            # 修正一些公式问题
-            with open(generated_fp, 'r', encoding='utf8') as f:
-                content = f.read()
-            # 将公式中的\[ \]替换成$$
-            content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
-            # 将公式中的\( \)替换成$
-            content = content.replace(r'\(', r'$').replace(r'\)', r'$')
-            content = content.replace('```', '\n').replace('```markdown', '\n')
-            with open(generated_fp, 'w', encoding='utf8') as f:
-                f.write(content)
+            with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
+            content = content.replace('```markdown', '\n').replace('```', '\n')
+            with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
+            # 生成在线预览html
+            file_name = '在线预览翻译' + gen_time_str() + '.html'
+            # with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
+            #     html_template = f.read()
+            # html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
+            preview_fp = os.path.join(ex_folder, file_name)
+            # with open(preview_fp, 'w', encoding='utf8') as f:
+            #     f.write(html_template)
+            # 生成在线预览html
+            from shared_utils.advanced_markdown_format import markdown_convertion_for_file
+            with open(generated_fp, "r", encoding="utf-8") as f:
+                md = f.read()
+            html = markdown_convertion_for_file(md)
+            # print(html)
+            with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
+            promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
            # 生成包含图片的压缩包
            dest_folder = get_log_folder(chatbot.get_user())
            zip_name = '翻译后的带图文档.zip'
            zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
            zip_fp = os.path.join(dest_folder, zip_name)
-            # 生成在线预览html
-            file_name = '在线预览翻译' + gen_time_str() + '.html'
-            with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
-                html_template = f.read()
-            html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
-            preview_fp = os.path.join(ex_folder, file_name)
-            with open(preview_fp, 'w', encoding='utf8') as f:
-                f.write(html_template)
-            promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
            promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
            yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
    md_zip_path = yield from pdf2markdown(fp)
--- a/crazy_functions/批量Markdown翻译.py
+++ b/crazy_functions/批量Markdown翻译.py
@ -1,5 +1,5 @@
 import glob, shutil, os, re, logging
-from toolbox import update_ui, trimmed_format_exc, gen_time_str, disable_auto_promotion
+from toolbox import update_ui, trimmed_format_exc, gen_time_str
 from toolbox import CatchException, report_exception, get_log_folder
 from toolbox import write_history_to_file, promote_file_to_downloadzone
 fast_debug = False
@ -18,7 +18,7 @@ class PaperFileGroup():
        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
        self.get_token_num = get_token_num

-    def run_file_split(self, max_token_limit=1900):
+    def run_file_split(self, max_token_limit=2048):
        """
        将长文本分离开来
        """
@ -64,17 +64,17 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
            pfg.file_contents.append(file_content)

    #  <-------- 拆分过长的Markdown文件 ---------->
-    pfg.run_file_split(max_token_limit=1500)
+    pfg.run_file_split(max_token_limit=2048)
    n_split = len(pfg.sp_file_contents)

    #  <-------- 多线程翻译开始 ---------->
    if language == 'en->zh':
-        inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands:" +
+        inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
                        f"\n\n{frag}" for frag in pfg.sp_file_contents]
        inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
        sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
    elif language == 'zh->en':
-        inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands:" +
+        inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
                        f"\n\n{frag}" for frag in pfg.sp_file_contents]
        inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
        sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
@ -164,7 +164,6 @@ def Markdown英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
        "函数插件功能？",
        "对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    disable_auto_promotion(chatbot)

    # 尝试导入依赖，如果缺少依赖，则给出安装建议
    try:
@ -204,7 +203,6 @@ def Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
        "函数插件功能？",
        "对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    disable_auto_promotion(chatbot)

    # 尝试导入依赖，如果缺少依赖，则给出安装建议
    try:
@ -237,7 +235,6 @@ def Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history,
        "函数插件功能？",
        "对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    disable_auto_promotion(chatbot)

    # 尝试导入依赖，如果缺少依赖，则给出安装建议
    try:
--- a/shared_utils/advanced_markdown_format.py
+++ b/shared_utils/advanced_markdown_format.py
@ -207,6 +207,40 @@ def fix_code_segment_indent(txt):
        return txt


+def markdown_convertion_for_file(txt):
+    """
+    Convert Markdown text to HTML wrapped in a "markdown-body" div; math script tags in the converted HTML are then replaced via replace_math_render.
+    """
+    pre = '<div class="markdown-body">'
+    suf = "</div>"
+    if txt.startswith(pre) and txt.endswith(suf):
+        # print('Warning: the input string has already been converted; converting it again may cause problems')
+        return txt  # already converted, no need to convert it again
+
+    find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
+    txt = fix_markdown_indent(txt)
+    # 1. convert the whole Markdown text to HTML
+    split = markdown.markdown(text="---")  # NOTE(review): `split` is not used anywhere below in this function
+    convert_stage_1 = markdown.markdown(
+        text=txt,
+        extensions=[
+            "sane_lists",
+            "tables",
+            "mdx_math",
+            "pymdownx.superfences",
+            "pymdownx.highlight",
+        ],
+        extension_configs={**markdown_extension_configs, **code_highlight_configs},
+    )
+    convert_stage_1 = markdown_bug_hunt(convert_stage_1)
+
+    # 2. replace every math/tex <script> tag using replace_math_render (DOTALL lets a formula span several lines)
+    convert_stage_2_2, n = re.subn(  # NOTE(review): the substitution count `n` is not used
+        find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
+    )
+    # wrap the result in the markdown-body div
+    return pre + convert_stage_2_2 + suf
+
@lru_cache(maxsize=128)  # 使用 lru缓存 加快转换速度
 def markdown_convertion(txt):
    """
--- a/tests/test_markdown.py
+++ b/tests/test_markdown.py
@ -43,8 +43,10 @@ def validate_path():

 validate_path()  # validate path so you can run from base directory
 from toolbox import markdown_convertion
-
-html = markdown_convertion(md)
+from shared_utils.advanced_markdown_format import markdown_convertion_for_file
+with open("gpt_log/default_user/shared/2024-04-22-01-27-43.zip.extract/translated_markdown.md", "r", encoding="utf-8") as f:
+    md = f.read()
+html = markdown_convertion_for_file(md)
 # print(html)
 with open("test.html", "w", encoding="utf-8") as f:
    f.write(html)