给nougat加线程锁 合并冗余代码
This commit is contained in:
parent
fb5467b85b
commit
14de282302
@ -1,5 +1,6 @@
|
|||||||
from toolbox import update_ui, get_conf, trimmed_format_exc
|
from toolbox import update_ui, get_conf, trimmed_format_exc, get_log_folder
|
||||||
import threading
|
import threading
|
||||||
|
import os
|
||||||
|
|
||||||
def input_clipping(inputs, history, max_token_limit):
|
def input_clipping(inputs, history, max_token_limit):
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -705,6 +706,40 @@ class knowledge_archive_interface():
|
|||||||
)
|
)
|
||||||
self.threadLock.release()
|
self.threadLock.release()
|
||||||
return resp, prompt
|
return resp, prompt
|
||||||
|
|
||||||
|
@Singleton
class nougat_interface():
    """Serialized gateway to the ``nougat`` command-line PDF parser.

    Every call to :meth:`NOUGAT_parse_pdf` holds ``self.threadLock`` for its
    whole duration, so at most one nougat process started through a given
    instance runs at a time.
    NOTE(review): ``Singleton`` is defined elsewhere in this file; presumably it
    makes all callers share one instance (and therefore one lock) -- confirm.
    """

    def __init__(self):
        # Guards the whole parse step in NOUGAT_parse_pdf.
        self.threadLock = threading.Lock()

    def nougat_with_timeout(self, command, cwd, timeout=3600):
        """Run ``command`` in ``cwd`` and wait at most ``timeout`` seconds.

        Args:
            command: Either a shell command string (executed through the
                shell, as before) or a list of arguments (executed directly,
                without a shell, so no quoting/escaping is needed).
            cwd: Working directory for the child process.
            timeout: Maximum number of seconds to wait for completion.

        Returns:
            True if the process finished within ``timeout`` (its exit status
            is not inspected), False if it had to be killed.
        """
        import subprocess
        # Strings keep the historical shell behaviour; lists bypass the shell.
        process = subprocess.Popen(command, shell=isinstance(command, str), cwd=cwd)
        try:
            process.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            # Reap the killed child so it does not linger as a zombie.
            process.communicate()
            print("Process timed out!")
            return False
        return True

    def NOUGAT_parse_pdf(self, fp):
        """Convert the PDF at ``fp`` with nougat and return the ``.mmd`` path.

        A fresh output directory is created under the 'nougat' log folder,
        nougat is run with that directory as ``--out``, and the first
        ``*.mmd`` file found there is returned.

        Args:
            fp: Path to the PDF file to parse.

        Returns:
            Path of the first ``.mmd`` file produced in the output directory.

        Raises:
            RuntimeError: If no ``.mmd`` file was produced.
        """
        import glob, os
        from toolbox import get_log_folder, gen_time_str

        # ``with`` guarantees the lock is released even when an exception is
        # raised below; the previous acquire()/release() pair left the lock
        # held forever after a failed parse, blocking every later call.
        with self.threadLock:
            dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str())
            os.makedirs(dst)
            # Argument list instead of an interpolated shell string: paths
            # containing quotes or shell metacharacters are passed verbatim.
            self.nougat_with_timeout(
                ['nougat', '--out', os.path.abspath(dst), os.path.abspath(fp)],
                os.getcwd(),
            )
            res = glob.glob(os.path.join(dst, '*.mmd'))
            if len(res) == 0:
                raise RuntimeError("Nougat解析论文失败。")
            return res[0]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def try_install_deps(deps, reload_m=[]):
|
def try_install_deps(deps, reload_m=[]):
|
||||||
import subprocess, sys, importlib
|
import subprocess, sys, importlib
|
||||||
@ -715,42 +750,43 @@ def try_install_deps(deps, reload_m=[]):
|
|||||||
for m in reload_m:
|
for m in reload_m:
|
||||||
importlib.reload(__import__(m))
|
importlib.reload(__import__(m))
|
||||||
|
|
||||||
class construct_html():
|
|
||||||
def __init__(self) -> None:
|
HTML_CSS = """
|
||||||
self.css = """
|
|
||||||
.row {
|
.row {
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-wrap: wrap;
|
flex-wrap: wrap;
|
||||||
}
|
}
|
||||||
|
|
||||||
.column {
|
.column {
|
||||||
flex: 1;
|
flex: 1;
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.table-header {
|
.table-header {
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
border-bottom: 1px solid black;
|
border-bottom: 1px solid black;
|
||||||
}
|
}
|
||||||
|
|
||||||
.table-row {
|
.table-row {
|
||||||
border-bottom: 1px solid lightgray;
|
border-bottom: 1px solid lightgray;
|
||||||
}
|
}
|
||||||
|
|
||||||
.table-cell {
|
.table-cell {
|
||||||
padding: 5px;
|
padding: 5px;
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
|
|
||||||
|
|
||||||
|
TABLE_CSS = """
|
||||||
def add_row(self, a, b):
|
|
||||||
tmp = """
|
|
||||||
<div class="row table-row">
|
<div class="row table-row">
|
||||||
<div class="column table-cell">REPLACE_A</div>
|
<div class="column table-cell">REPLACE_A</div>
|
||||||
<div class="column table-cell">REPLACE_B</div>
|
<div class="column table-cell">REPLACE_B</div>
|
||||||
</div>
|
</div>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
class construct_html():
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.css = HTML_CSS
|
||||||
|
self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
|
||||||
|
|
||||||
|
|
||||||
|
def add_row(self, a, b):
|
||||||
|
tmp = TABLE_CSS
|
||||||
from toolbox import markdown_convertion
|
from toolbox import markdown_convertion
|
||||||
tmp = tmp.replace('REPLACE_A', markdown_convertion(a))
|
tmp = tmp.replace('REPLACE_A', markdown_convertion(a))
|
||||||
tmp = tmp.replace('REPLACE_B', markdown_convertion(b))
|
tmp = tmp.replace('REPLACE_B', markdown_convertion(b))
|
||||||
@ -758,6 +794,6 @@ class construct_html():
|
|||||||
|
|
||||||
|
|
||||||
def save_file(self, file_name):
|
def save_file(self, file_name):
|
||||||
with open(f'./gpt_log/{file_name}', 'w', encoding='utf8') as f:
|
with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
|
||||||
f.write(self.html_string.encode('utf-8', 'ignore').decode())
|
f.write(self.html_string.encode('utf-8', 'ignore').decode())
|
||||||
|
return os.path.join(get_log_folder(), file_name)
|
||||||
|
@ -86,31 +86,8 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
|
|||||||
# 开始正式执行任务
|
# 开始正式执行任务
|
||||||
yield from 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
|
yield from 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
|
||||||
|
|
||||||
|
|
||||||
def nougat_with_timeout(command, cwd, timeout=3600):
|
|
||||||
import subprocess
|
|
||||||
process = subprocess.Popen(command, shell=True, cwd=cwd)
|
|
||||||
try:
|
|
||||||
stdout, stderr = process.communicate(timeout=timeout)
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
process.kill()
|
|
||||||
stdout, stderr = process.communicate()
|
|
||||||
print("Process timed out!")
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def NOUGAT_parse_pdf(fp):
|
|
||||||
import glob
|
|
||||||
from toolbox import get_log_folder, gen_time_str
|
|
||||||
dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str())
|
|
||||||
os.makedirs(dst)
|
|
||||||
nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd())
|
|
||||||
res = glob.glob(os.path.join(dst,'*.mmd'))
|
|
||||||
if len(res) == 0:
|
|
||||||
raise RuntimeError("Nougat解析论文失败。")
|
|
||||||
return res[0]
|
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
|
def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
|
||||||
import copy
|
import copy
|
||||||
@ -119,9 +96,11 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa
|
|||||||
generated_conclusion_files = []
|
generated_conclusion_files = []
|
||||||
generated_html_files = []
|
generated_html_files = []
|
||||||
DST_LANG = "中文"
|
DST_LANG = "中文"
|
||||||
|
from crazy_functions.crazy_utils import nougat_interface, construct_html
|
||||||
|
nougat_handle = nougat_interface()
|
||||||
for index, fp in enumerate(file_manifest):
|
for index, fp in enumerate(file_manifest):
|
||||||
chatbot.append(["当前进度:", f"正在解析论文,请稍候。(第一次运行时,需要花费较长时间下载NOUGAT参数)"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
chatbot.append(["当前进度:", f"正在解析论文,请稍候。(第一次运行时,需要花费较长时间下载NOUGAT参数)"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
fpp = NOUGAT_parse_pdf(fp)
|
fpp = nougat_handle.NOUGAT_parse_pdf(fp)
|
||||||
|
|
||||||
with open(fpp, 'r', encoding='utf8') as f:
|
with open(fpp, 'r', encoding='utf8') as f:
|
||||||
article_content = f.readlines()
|
article_content = f.readlines()
|
||||||
@ -222,50 +201,3 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa
|
|||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class construct_html():
|
|
||||||
def __init__(self) -> None:
|
|
||||||
self.css = """
|
|
||||||
.row {
|
|
||||||
display: flex;
|
|
||||||
flex-wrap: wrap;
|
|
||||||
}
|
|
||||||
|
|
||||||
.column {
|
|
||||||
flex: 1;
|
|
||||||
padding: 10px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table-header {
|
|
||||||
font-weight: bold;
|
|
||||||
border-bottom: 1px solid black;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table-row {
|
|
||||||
border-bottom: 1px solid lightgray;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table-cell {
|
|
||||||
padding: 5px;
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
|
|
||||||
|
|
||||||
|
|
||||||
def add_row(self, a, b):
|
|
||||||
tmp = """
|
|
||||||
<div class="row table-row">
|
|
||||||
<div class="column table-cell">REPLACE_A</div>
|
|
||||||
<div class="column table-cell">REPLACE_B</div>
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
from toolbox import markdown_convertion
|
|
||||||
tmp = tmp.replace('REPLACE_A', markdown_convertion(a))
|
|
||||||
tmp = tmp.replace('REPLACE_B', markdown_convertion(b))
|
|
||||||
self.html_string += tmp
|
|
||||||
|
|
||||||
|
|
||||||
def save_file(self, file_name):
|
|
||||||
with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
|
|
||||||
f.write(self.html_string.encode('utf-8', 'ignore').decode())
|
|
||||||
return os.path.join(get_log_folder(), file_name)
|
|
||||||
|
@ -63,6 +63,7 @@ def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwa
|
|||||||
generated_conclusion_files = []
|
generated_conclusion_files = []
|
||||||
generated_html_files = []
|
generated_html_files = []
|
||||||
DST_LANG = "中文"
|
DST_LANG = "中文"
|
||||||
|
from crazy_functions.crazy_utils import construct_html
|
||||||
for index, fp in enumerate(file_manifest):
|
for index, fp in enumerate(file_manifest):
|
||||||
chatbot.append(["当前进度:", f"正在连接GROBID服务,请稍候: {grobid_url}\n如果等待时间过长,请修改config中的GROBID_URL,可修改成本地GROBID服务。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
chatbot.append(["当前进度:", f"正在连接GROBID服务,请稍候: {grobid_url}\n如果等待时间过长,请修改config中的GROBID_URL,可修改成本地GROBID服务。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
article_dict = parse_pdf(fp, grobid_url)
|
article_dict = parse_pdf(fp, grobid_url)
|
||||||
@ -166,6 +167,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
|
|||||||
TOKEN_LIMIT_PER_FRAGMENT = 1280
|
TOKEN_LIMIT_PER_FRAGMENT = 1280
|
||||||
generated_conclusion_files = []
|
generated_conclusion_files = []
|
||||||
generated_html_files = []
|
generated_html_files = []
|
||||||
|
from crazy_functions.crazy_utils import construct_html
|
||||||
for index, fp in enumerate(file_manifest):
|
for index, fp in enumerate(file_manifest):
|
||||||
# 读取PDF文件
|
# 读取PDF文件
|
||||||
file_content, page_one = read_and_clean_pdf_text(fp)
|
file_content, page_one = read_and_clean_pdf_text(fp)
|
||||||
@ -261,49 +263,3 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
|
|||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
|
|
||||||
class construct_html():
|
|
||||||
def __init__(self) -> None:
|
|
||||||
self.css = """
|
|
||||||
.row {
|
|
||||||
display: flex;
|
|
||||||
flex-wrap: wrap;
|
|
||||||
}
|
|
||||||
|
|
||||||
.column {
|
|
||||||
flex: 1;
|
|
||||||
padding: 10px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table-header {
|
|
||||||
font-weight: bold;
|
|
||||||
border-bottom: 1px solid black;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table-row {
|
|
||||||
border-bottom: 1px solid lightgray;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table-cell {
|
|
||||||
padding: 5px;
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
|
|
||||||
|
|
||||||
|
|
||||||
def add_row(self, a, b):
|
|
||||||
tmp = """
|
|
||||||
<div class="row table-row">
|
|
||||||
<div class="column table-cell">REPLACE_A</div>
|
|
||||||
<div class="column table-cell">REPLACE_B</div>
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
from toolbox import markdown_convertion
|
|
||||||
tmp = tmp.replace('REPLACE_A', markdown_convertion(a))
|
|
||||||
tmp = tmp.replace('REPLACE_B', markdown_convertion(b))
|
|
||||||
self.html_string += tmp
|
|
||||||
|
|
||||||
|
|
||||||
def save_file(self, file_name):
|
|
||||||
with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
|
|
||||||
f.write(self.html_string.encode('utf-8', 'ignore').decode())
|
|
||||||
return os.path.join(get_log_folder(), file_name)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user