fix equation showing problem
This commit is contained in:
parent
09990d44d3
commit
6ad15a6129
@ -5,6 +5,7 @@ from toolbox import get_upload_folder, zip_folder
|
|||||||
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
|
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
|
||||||
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
|
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
|
||||||
from .crazy_utils import read_and_clean_pdf_text
|
from .crazy_utils import read_and_clean_pdf_text
|
||||||
|
from .crazy_utils import get_files_from_everything
|
||||||
from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf
|
from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf
|
||||||
from colorful import *
|
from colorful import *
|
||||||
import os
|
import os
|
||||||
@ -15,9 +16,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
|
|||||||
|
|
||||||
disable_auto_promotion(chatbot)
|
disable_auto_promotion(chatbot)
|
||||||
# 基本信息:功能、贡献者
|
# 基本信息:功能、贡献者
|
||||||
chatbot.append([
|
chatbot.append([None, "插件功能:批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
|
||||||
"函数插件功能?",
|
|
||||||
"批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
|
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
||||||
@ -33,7 +32,6 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
|
|||||||
# 清空历史,以免输入溢出
|
# 清空历史,以免输入溢出
|
||||||
history = []
|
history = []
|
||||||
|
|
||||||
from .crazy_utils import get_files_from_everything
|
|
||||||
success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
|
success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
|
||||||
# 检测输入参数,如没有给定输入参数,直接退出
|
# 检测输入参数,如没有给定输入参数,直接退出
|
||||||
if not success:
|
if not success:
|
||||||
@ -48,17 +46,25 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
|
|||||||
|
|
||||||
# 开始正式执行任务
|
# 开始正式执行任务
|
||||||
DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
|
DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
|
||||||
|
# ------- 第一种方法,效果最好,但是需要DOC2X服务 -------
|
||||||
if len(DOC2X_API_KEY) != 0:
|
if len(DOC2X_API_KEY) != 0:
|
||||||
yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
|
try:
|
||||||
return
|
yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
|
||||||
|
return
|
||||||
|
except:
|
||||||
|
chatbot.append([None, "DOC2X服务不可用,现在将执行效果稍差的旧版代码。"])
|
||||||
|
yield from update_ui(chatbot=chatbot, history=history)
|
||||||
|
|
||||||
|
# ------- 第二种方法,效果次优 -------
|
||||||
grobid_url = get_avail_grobid_url()
|
grobid_url = get_avail_grobid_url()
|
||||||
if grobid_url is not None:
|
if grobid_url is not None:
|
||||||
yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
|
yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
|
||||||
return
|
return
|
||||||
else:
|
|
||||||
yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
|
# ------- 第三种方法,早期代码,效果不理想 -------
|
||||||
yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
|
yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
|
||||||
return
|
yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -110,7 +116,7 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
|
|
||||||
def deliver_to_markdown_plugin(md_zip_path, user_request):
|
def deliver_to_markdown_plugin(md_zip_path, user_request):
|
||||||
from crazy_functions.批量Markdown翻译 import Markdown英译中
|
from crazy_functions.批量Markdown翻译 import Markdown英译中
|
||||||
import shutil
|
import shutil, re
|
||||||
|
|
||||||
time_tag = gen_time_str()
|
time_tag = gen_time_str()
|
||||||
target_path_base = get_log_folder(chatbot.get_user())
|
target_path_base = get_log_folder(chatbot.get_user())
|
||||||
@ -122,6 +128,23 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
extract_archive(
|
extract_archive(
|
||||||
file_path=this_file_path, dest_dir=ex_folder
|
file_path=this_file_path, dest_dir=ex_folder
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# edit markdown files
|
||||||
|
success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md')
|
||||||
|
for generated_fp in file_manifest:
|
||||||
|
# 修正一些公式问题
|
||||||
|
with open(generated_fp, 'r', encoding='utf8') as f:
|
||||||
|
content = f.read()
|
||||||
|
# 将公式中的\[ \]替换成$$
|
||||||
|
content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
|
||||||
|
# 将公式中的\( \)替换成$
|
||||||
|
content = content.replace(r'\(', r'$').replace(r'\)', r'$')
|
||||||
|
content = content.replace('```markdown', '\n').replace('```', '\n')
|
||||||
|
with open(generated_fp, 'w', encoding='utf8') as f:
|
||||||
|
f.write(content)
|
||||||
|
promote_file_to_downloadzone(generated_fp, chatbot=chatbot)
|
||||||
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
|
chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
|
||||||
plugin_kwargs['markdown_expected_output_dir'] = ex_folder
|
plugin_kwargs['markdown_expected_output_dir'] = ex_folder
|
||||||
|
|
||||||
@ -131,29 +154,30 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
|
yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
|
||||||
if os.path.exists(generated_fp):
|
if os.path.exists(generated_fp):
|
||||||
# 修正一些公式问题
|
# 修正一些公式问题
|
||||||
with open(generated_fp, 'r', encoding='utf8') as f:
|
with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
|
||||||
content = f.read()
|
content = content.replace('```markdown', '\n').replace('```', '\n')
|
||||||
# 将公式中的\[ \]替换成$$
|
with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
|
||||||
content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
|
# 生成在线预览html
|
||||||
# 将公式中的\( \)替换成$
|
file_name = '在线预览翻译' + gen_time_str() + '.html'
|
||||||
content = content.replace(r'\(', r'$').replace(r'\)', r'$')
|
# with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
|
||||||
content = content.replace('```', '\n').replace('```markdown', '\n')
|
# html_template = f.read()
|
||||||
with open(generated_fp, 'w', encoding='utf8') as f:
|
# html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
|
||||||
f.write(content)
|
preview_fp = os.path.join(ex_folder, file_name)
|
||||||
|
# with open(preview_fp, 'w', encoding='utf8') as f:
|
||||||
|
# f.write(html_template)
|
||||||
|
# 生成在线预览html
|
||||||
|
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
||||||
|
with open(generated_fp, "r", encoding="utf-8") as f:
|
||||||
|
md = f.read()
|
||||||
|
html = markdown_convertion_for_file(md)
|
||||||
|
# print(html)
|
||||||
|
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
|
||||||
|
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
|
||||||
# 生成包含图片的压缩包
|
# 生成包含图片的压缩包
|
||||||
dest_folder = get_log_folder(chatbot.get_user())
|
dest_folder = get_log_folder(chatbot.get_user())
|
||||||
zip_name = '翻译后的带图文档.zip'
|
zip_name = '翻译后的带图文档.zip'
|
||||||
zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
|
zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
|
||||||
zip_fp = os.path.join(dest_folder, zip_name)
|
zip_fp = os.path.join(dest_folder, zip_name)
|
||||||
# 生成在线预览html
|
|
||||||
file_name = '在线预览翻译' + gen_time_str() + '.html'
|
|
||||||
with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
|
|
||||||
html_template = f.read()
|
|
||||||
html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
|
|
||||||
preview_fp = os.path.join(ex_folder, file_name)
|
|
||||||
with open(preview_fp, 'w', encoding='utf8') as f:
|
|
||||||
f.write(html_template)
|
|
||||||
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
|
|
||||||
promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
|
promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
md_zip_path = yield from pdf2markdown(fp)
|
md_zip_path = yield from pdf2markdown(fp)
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import glob, shutil, os, re, logging
|
import glob, shutil, os, re, logging
|
||||||
from toolbox import update_ui, trimmed_format_exc, gen_time_str, disable_auto_promotion
|
from toolbox import update_ui, trimmed_format_exc, gen_time_str
|
||||||
from toolbox import CatchException, report_exception, get_log_folder
|
from toolbox import CatchException, report_exception, get_log_folder
|
||||||
from toolbox import write_history_to_file, promote_file_to_downloadzone
|
from toolbox import write_history_to_file, promote_file_to_downloadzone
|
||||||
fast_debug = False
|
fast_debug = False
|
||||||
@ -18,7 +18,7 @@ class PaperFileGroup():
|
|||||||
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
|
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
|
||||||
self.get_token_num = get_token_num
|
self.get_token_num = get_token_num
|
||||||
|
|
||||||
def run_file_split(self, max_token_limit=1900):
|
def run_file_split(self, max_token_limit=2048):
|
||||||
"""
|
"""
|
||||||
将长文本分离开来
|
将长文本分离开来
|
||||||
"""
|
"""
|
||||||
@ -64,17 +64,17 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
|
|||||||
pfg.file_contents.append(file_content)
|
pfg.file_contents.append(file_content)
|
||||||
|
|
||||||
# <-------- 拆分过长的Markdown文件 ---------->
|
# <-------- 拆分过长的Markdown文件 ---------->
|
||||||
pfg.run_file_split(max_token_limit=1500)
|
pfg.run_file_split(max_token_limit=2048)
|
||||||
n_split = len(pfg.sp_file_contents)
|
n_split = len(pfg.sp_file_contents)
|
||||||
|
|
||||||
# <-------- 多线程翻译开始 ---------->
|
# <-------- 多线程翻译开始 ---------->
|
||||||
if language == 'en->zh':
|
if language == 'en->zh':
|
||||||
inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands:" +
|
inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
|
||||||
f"\n\n{frag}" for frag in pfg.sp_file_contents]
|
f"\n\n{frag}" for frag in pfg.sp_file_contents]
|
||||||
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
|
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
|
||||||
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
|
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
|
||||||
elif language == 'zh->en':
|
elif language == 'zh->en':
|
||||||
inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands:" +
|
inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
|
||||||
f"\n\n{frag}" for frag in pfg.sp_file_contents]
|
f"\n\n{frag}" for frag in pfg.sp_file_contents]
|
||||||
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
|
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
|
||||||
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
|
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
|
||||||
@ -164,7 +164,6 @@ def Markdown英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
|
|||||||
"函数插件功能?",
|
"函数插件功能?",
|
||||||
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
|
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
disable_auto_promotion(chatbot)
|
|
||||||
|
|
||||||
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
||||||
try:
|
try:
|
||||||
@ -204,7 +203,6 @@ def Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
|
|||||||
"函数插件功能?",
|
"函数插件功能?",
|
||||||
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
|
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
disable_auto_promotion(chatbot)
|
|
||||||
|
|
||||||
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
||||||
try:
|
try:
|
||||||
@ -237,7 +235,6 @@ def Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history,
|
|||||||
"函数插件功能?",
|
"函数插件功能?",
|
||||||
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
|
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
disable_auto_promotion(chatbot)
|
|
||||||
|
|
||||||
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
||||||
try:
|
try:
|
||||||
|
@ -207,6 +207,40 @@ def fix_code_segment_indent(txt):
|
|||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
||||||
|
def markdown_convertion_for_file(txt: str) -> str:
    """
    Convert Markdown text into an HTML fragment wrapped in a ``markdown-body`` div.

    The text is first normalised with ``fix_markdown_indent``, rendered to HTML
    by ``markdown.markdown`` with the list/table/math/code-fence extensions,
    passed through ``markdown_bug_hunt``, and finally every
    ``<script type="math/tex...">`` element is rewritten by
    ``replace_math_render``.

    Args:
        txt: Markdown source, or an HTML fragment previously produced by this
            function.

    Returns:
        The HTML fragment, enclosed in ``<div class="markdown-body">...</div>``.
        Input that already starts and ends with that wrapper is returned
        unchanged, so the conversion is not applied twice.
    """
    pre = '<div class="markdown-body">'
    suf = "</div>"
    if txt.startswith(pre) and txt.endswith(suf):
        # Already converted: a second pass could corrupt the generated HTML.
        return txt

    # Matches the equation <script> elements found in the rendered HTML
    # (presumably emitted by the "mdx_math" extension — confirm against its docs).
    find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'

    # 1. Convert everything to HTML.
    txt = fix_markdown_indent(txt)
    convert_stage_1 = markdown.markdown(
        text=txt,
        extensions=[
            "sane_lists",
            "tables",
            "mdx_math",
            "pymdownx.superfences",
            "pymdownx.highlight",
        ],
        extension_configs={**markdown_extension_configs, **code_highlight_configs},
    )
    convert_stage_1 = markdown_bug_hunt(convert_stage_1)

    # 2. Replace each equation element with its rendered form. DOTALL lets an
    #    equation body span several lines. The substitution count is not needed,
    #    so re.sub is used rather than re.subn.
    convert_stage_2 = re.sub(
        find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
    )

    # 3. Wrap the result so later calls can recognise it as already converted.
    return pre + convert_stage_2 + suf
|
||||||
|
|
||||||
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
|
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
|
||||||
def markdown_convertion(txt):
|
def markdown_convertion(txt):
|
||||||
"""
|
"""
|
||||||
|
@ -43,8 +43,10 @@ def validate_path():
|
|||||||
|
|
||||||
validate_path() # validate path so you can run from base directory
|
validate_path() # validate path so you can run from base directory
|
||||||
from toolbox import markdown_convertion
|
from toolbox import markdown_convertion
|
||||||
|
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
||||||
html = markdown_convertion(md)
|
with open("gpt_log/default_user/shared/2024-04-22-01-27-43.zip.extract/translated_markdown.md", "r", encoding="utf-8") as f:
|
||||||
|
md = f.read()
|
||||||
|
html = markdown_convertion_for_file(md)
|
||||||
# print(html)
|
# print(html)
|
||||||
with open("test.html", "w", encoding="utf-8") as f:
|
with open("test.html", "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user