From 6ad15a61299451d688913a0c917d1cca52d6d691 Mon Sep 17 00:00:00 2001
From: binary-husky <qingxu.fu@outlook.com>
Date: Mon, 22 Apr 2024 01:54:03 +0800
Subject: [PATCH] fix equation showing problem

---
 crazy_functions/PDF批量翻译.py           | 82 +++++++++++++++---------
 crazy_functions/批量Markdown翻译.py      | 13 ++--
 shared_utils/advanced_markdown_format.py | 34 ++++++++++
 tests/test_markdown.py                   |  6 +-
 4 files changed, 96 insertions(+), 39 deletions(-)

diff --git a/crazy_functions/PDF批量翻译.py b/crazy_functions/PDF批量翻译.py
index 630d595..dddc8f2 100644
--- a/crazy_functions/PDF批量翻译.py
+++ b/crazy_functions/PDF批量翻译.py
@@ -5,6 +5,7 @@ from toolbox import get_upload_folder, zip_folder
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 from .crazy_utils import read_and_clean_pdf_text
+from .crazy_utils import get_files_from_everything
 from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf
 from colorful import *
 import os
@@ -15,9 +16,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
 
     disable_auto_promotion(chatbot)
     # 基本信息：功能、贡献者
-    chatbot.append([
-        "函数插件功能？",
-        "批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
+    chatbot.append([None, "插件功能：批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
 
     # 尝试导入依赖，如果缺少依赖，则给出安装建议
@@ -33,7 +32,6 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
     # 清空历史，以免输入溢出
     history = []
 
-    from .crazy_utils import get_files_from_everything
     success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
     # 检测输入参数，如没有给定输入参数，直接退出
     if not success:
@@ -48,17 +46,25 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
 
     # 开始正式执行任务
     DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
+    # ------- 第一种方法，效果最好，但是需要DOC2X服务 -------
     if len(DOC2X_API_KEY) != 0:
-        yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
-        return
+        try:
+            yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
+            return
+        except:
+            chatbot.append([None, "DOC2X服务不可用，现在将执行效果稍差的旧版代码。"])
+            yield from update_ui(chatbot=chatbot, history=history)
+
+    # ------- 第二种方法，效果次优 -------
     grobid_url = get_avail_grobid_url()
     if grobid_url is not None:
         yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
         return
-    else:
-        yield from update_ui_lastest_msg("GROBID服务不可用，请检查config中的GROBID_URL。作为替代，现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
-        yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
-        return
+
+    # ------- 第三种方法，早期代码，效果不理想 -------
+    yield from update_ui_lastest_msg("GROBID服务不可用，请检查config中的GROBID_URL。作为替代，现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
+    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
+    return
 
 
 
@@ -110,7 +116,7 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
 
     def deliver_to_markdown_plugin(md_zip_path, user_request):
         from crazy_functions.批量Markdown翻译 import Markdown英译中
-        import shutil
+        import shutil, re
 
         time_tag = gen_time_str()
         target_path_base = get_log_folder(chatbot.get_user())
@@ -122,6 +128,23 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
         extract_archive(
             file_path=this_file_path, dest_dir=ex_folder
         )
+
+        # edit markdown files
+        success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md')
+        for generated_fp in file_manifest:
+            # 修正一些公式问题
+            with open(generated_fp, 'r', encoding='utf8') as f:
+                content = f.read()
+            # 将公式中的\[ \]替换成$$
+            content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
+            # 将公式中的\( \)替换成$
+            content = content.replace(r'\(', r'$').replace(r'\)', r'$')
+            content = content.replace('```markdown', '\n').replace('```', '\n')
+            with open(generated_fp, 'w', encoding='utf8') as f:
+                f.write(content)
+            promote_file_to_downloadzone(generated_fp, chatbot=chatbot)
+            yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
         chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
         plugin_kwargs['markdown_expected_output_dir'] = ex_folder
 
@@ -131,29 +154,30 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
         yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
         if os.path.exists(generated_fp):
             # 修正一些公式问题
-            with open(generated_fp, 'r', encoding='utf8') as f:
-                content = f.read()
-            # 将公式中的\[ \]替换成$$
-            content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
-            # 将公式中的\( \)替换成$
-            content = content.replace(r'\(', r'$').replace(r'\)', r'$')
-            content = content.replace('```', '\n').replace('```markdown', '\n')
-            with open(generated_fp, 'w', encoding='utf8') as f:
-                f.write(content)
+            with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
+            content = content.replace('```markdown', '\n').replace('```', '\n')
+            with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
+            # 生成在线预览html
+            file_name = '在线预览翻译' + gen_time_str() + '.html'
+            # with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
+            #     html_template = f.read()
+            # html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
+            preview_fp = os.path.join(ex_folder, file_name)
+            # with open(preview_fp, 'w', encoding='utf8') as f:
+            #     f.write(html_template)
+            # 生成在线预览html
+            from shared_utils.advanced_markdown_format import markdown_convertion_for_file
+            with open(generated_fp, "r", encoding="utf-8") as f:
+                md = f.read()
+            html = markdown_convertion_for_file(md)
+            # print(html)
+            with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
+            promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
             # 生成包含图片的压缩包
             dest_folder = get_log_folder(chatbot.get_user())
             zip_name = '翻译后的带图文档.zip'
             zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
             zip_fp = os.path.join(dest_folder, zip_name)
-            # 生成在线预览html
-            file_name = '在线预览翻译' + gen_time_str() + '.html'
-            with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
-                html_template = f.read()
-            html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
-            preview_fp = os.path.join(ex_folder, file_name)
-            with open(preview_fp, 'w', encoding='utf8') as f:
-                f.write(html_template)
-            promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
             promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
             yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
     md_zip_path = yield from pdf2markdown(fp)
diff --git a/crazy_functions/批量Markdown翻译.py b/crazy_functions/批量Markdown翻译.py
index 9a0be94..61dd4b5 100644
--- a/crazy_functions/批量Markdown翻译.py
+++ b/crazy_functions/批量Markdown翻译.py
@@ -1,5 +1,5 @@
 import glob, shutil, os, re, logging
-from toolbox import update_ui, trimmed_format_exc, gen_time_str, disable_auto_promotion
+from toolbox import update_ui, trimmed_format_exc, gen_time_str
 from toolbox import CatchException, report_exception, get_log_folder
 from toolbox import write_history_to_file, promote_file_to_downloadzone
 fast_debug = False
@@ -18,7 +18,7 @@ class PaperFileGroup():
         def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         self.get_token_num = get_token_num
 
-    def run_file_split(self, max_token_limit=1900):
+    def run_file_split(self, max_token_limit=2048):
         """
         将长文本分离开来
         """
@@ -64,17 +64,17 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
             pfg.file_contents.append(file_content)
 
     #  <-------- 拆分过长的Markdown文件 ---------->
-    pfg.run_file_split(max_token_limit=1500)
+    pfg.run_file_split(max_token_limit=2048)
     n_split = len(pfg.sp_file_contents)
 
     #  <-------- 多线程翻译开始 ---------->
     if language == 'en->zh':
-        inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands:" +
+        inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
                         f"\n\n{frag}" for frag in pfg.sp_file_contents]
         inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
         sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
     elif language == 'zh->en':
-        inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands:" +
+        inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
                         f"\n\n{frag}" for frag in pfg.sp_file_contents]
         inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
         sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
@@ -164,7 +164,6 @@ def Markdown英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
         "函数插件功能？",
         "对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    disable_auto_promotion(chatbot)
 
     # 尝试导入依赖，如果缺少依赖，则给出安装建议
     try:
@@ -204,7 +203,6 @@ def Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
         "函数插件功能？",
         "对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    disable_auto_promotion(chatbot)
 
     # 尝试导入依赖，如果缺少依赖，则给出安装建议
     try:
@@ -237,7 +235,6 @@ def Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history,
         "函数插件功能？",
         "对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    disable_auto_promotion(chatbot)
 
     # 尝试导入依赖，如果缺少依赖，则给出安装建议
     try:
diff --git a/shared_utils/advanced_markdown_format.py b/shared_utils/advanced_markdown_format.py
index a015fd6..ce60f88 100644
--- a/shared_utils/advanced_markdown_format.py
+++ b/shared_utils/advanced_markdown_format.py
@@ -207,6 +207,40 @@ def fix_code_segment_indent(txt):
         return txt
 
 
+def markdown_convertion_for_file(txt):
+    """
+    将Markdown格式的文本转换为HTML格式。如果包含数学公式，则先将公式转换为HTML格式。
+    """
+    pre = '<div class="markdown-body">'
+    suf = "</div>"
+    if txt.startswith(pre) and txt.endswith(suf):
+        # print('警告，输入了已经经过转化的字符串，二次转化可能出问题')
+        return txt  # 已经被转化过，不需要再次转化
+
+    find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
+    txt = fix_markdown_indent(txt)
+    # convert everything to html format
+    split = markdown.markdown(text="---")
+    convert_stage_1 = markdown.markdown(
+        text=txt,
+        extensions=[
+            "sane_lists",
+            "tables",
+            "mdx_math",
+            "pymdownx.superfences",
+            "pymdownx.highlight",
+        ],
+        extension_configs={**markdown_extension_configs, **code_highlight_configs},
+    )
+    convert_stage_1 = markdown_bug_hunt(convert_stage_1)
+
+    # 2. convert to rendered equation
+    convert_stage_2_2, n = re.subn(
+        find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
+    )
+    # cat them together
+    return pre + convert_stage_2_2 + suf
+
 @lru_cache(maxsize=128)  # 使用 lru缓存 加快转换速度
 def markdown_convertion(txt):
     """
diff --git a/tests/test_markdown.py b/tests/test_markdown.py
index 2089c60..f448404 100644
--- a/tests/test_markdown.py
+++ b/tests/test_markdown.py
@@ -43,8 +43,10 @@ def validate_path():
 
 validate_path()  # validate path so you can run from base directory
 from toolbox import markdown_convertion
-
-html = markdown_convertion(md)
+from shared_utils.advanced_markdown_format import markdown_convertion_for_file
+with open("gpt_log/default_user/shared/2024-04-22-01-27-43.zip.extract/translated_markdown.md", "r", encoding="utf-8") as f:
+    md = f.read()
+html = markdown_convertion_for_file(md)
 # print(html)
 with open("test.html", "w", encoding="utf-8") as f:
     f.write(html)