Latex全文润色

This commit is contained in:
qingxu fu 2023-04-09 23:28:57 +08:00
parent 3725122de1
commit 49a6ff6a7c
9 changed files with 180 additions and 50 deletions

View File

@ -72,7 +72,7 @@ def patch_and_restart(path):
time.sleep(1) time.sleep(1)
print(i) print(i)
print(' ------------------------------ -----------------------------------') print(' ------------------------------ -----------------------------------')
os.execl(sys.executable, 'python', 'main.py') os.execl(sys.executable, *sys.argv)
def get_current_version(): def get_current_version():

View File

@ -31,7 +31,7 @@ CODE_HIGHLIGHT = True
LAYOUT = "LEFT-RIGHT" # "LEFT-RIGHT"(左右布局) # "TOP-DOWN"(上下布局) LAYOUT = "LEFT-RIGHT" # "LEFT-RIGHT"(左右布局) # "TOP-DOWN"(上下布局)
# 发送请求到OpenAI后等待多久判定为超时 # 发送请求到OpenAI后等待多久判定为超时
TIMEOUT_SECONDS = 25 TIMEOUT_SECONDS = 30
# 网页的端口, -1代表随机端口 # 网页的端口, -1代表随机端口
WEB_PORT = -1 WEB_PORT = -1

View File

@ -76,6 +76,8 @@ def get_crazy_functions():
from crazy_functions.谷歌检索小助手 import 谷歌检索小助手 from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
from crazy_functions.理解PDF文档内容 import 理解PDF文档内容 from crazy_functions.理解PDF文档内容 import 理解PDF文档内容
from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入 from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
from crazy_functions.Latex全文润色 import Latex英文润色
from crazy_functions.Latex全文润色 import Latex中文润色
function_plugins.update({ function_plugins.update({
"批量翻译PDF文档多线程": { "批量翻译PDF文档多线程": {
@ -114,6 +116,18 @@ def get_crazy_functions():
"AsButton": False, # 加入下拉菜单中 "AsButton": False, # 加入下拉菜单中
"Function": HotReload(理解PDF文档内容标准文件输入) "Function": HotReload(理解PDF文档内容标准文件输入)
}, },
"英文Latex项目全文润色输入路径或上传压缩包": {
# HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Function": HotReload(Latex英文润色)
},
# "中文Latex项目全文润色输入路径或上传压缩包": {
# # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
# "Color": "stop",
# "AsButton": False, # 加入下拉菜单中
# "Function": HotReload(Latex中文润色)
# },
}) })
###################### 第三组插件 ########################### ###################### 第三组插件 ###########################

View File

@ -2,51 +2,125 @@ from toolbox import update_ui
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
fast_debug = False fast_debug = False
class PaperFileGroup():
    """
    Container for the text of several LaTeX files, able to split over-long
    files into fragments that fit under a token limit.

    The caller fills `file_paths` and `file_contents` (parallel lists), then
    calls `run_file_split`; the results land in the three parallel `sp_*` lists.
    """
    def __init__(self):
        self.file_paths = []        # path of each input file
        self.file_contents = []     # text of each input file, same order as file_paths
        self.sp_file_contents = []  # fragments produced by run_file_split
        self.sp_file_index = []     # per fragment: index of its source file in file_paths
        self.sp_file_tag = []       # per fragment: source path, or path + ".part-<j>.tex"

        # Token counter built on the tokenizer tiktoken associates with the
        # configured LLM_MODEL. The value returned by get_conf is unpacked as
        # positional arguments (presumably a 1-tuple; confirm against toolbox).
        import tiktoken
        from toolbox import get_conf
        enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))

        def get_token_num(txt):
            # disallowed_special=() makes special-token text (e.g. "<|endoftext|>")
            # count as ordinary text instead of raising ValueError, so arbitrary
            # paper content can always be measured.
            return len(enc.encode(txt, disallowed_special=()))
        self.get_token_num = get_token_num

    def run_file_split(self, max_token_limit=1900):
        """
        Split every entry of `file_contents` into fragments below `max_token_limit`.

        A file whose token count is already below the limit becomes a single
        fragment tagged with its own path. Any other file is handed to
        `breakdown_txt_to_satisfy_token_limit_for_pdf`, and each returned
        segment is tagged `<path>.part-<j>.tex`.

        The `sp_*` lists are rebuilt from scratch on every call, so calling
        this method again (e.g. with another limit) does not duplicate fragments.
        """
        self.sp_file_contents = []
        self.sp_file_index = []
        self.sp_file_tag = []

        for index, file_content in enumerate(self.file_contents):
            if self.get_token_num(file_content) < max_token_limit:
                self.sp_file_contents.append(file_content)
                self.sp_file_index.append(index)
                self.sp_file_tag.append(self.file_paths[index])
            else:
                # Imported lazily: only needed when a file actually has to be split.
                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
                for j, segment in enumerate(segments):
                    self.sp_file_contents.append(segment)
                    self.sp_file_index.append(index)
                    self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")

        print('Segmentation: done')
def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'):
    """
    Polish every LaTeX file listed in `file_manifest` with the LLM and write a report.

    This is a generator: it yields whatever `update_ui` and the multi-threaded
    request helper yield, so the caller must drive it with `yield from`.

    Steps: read each file and strip LaTeX comments, split the texts into
    fragments of fewer than 1024 tokens, send one polishing request per
    fragment, write the collected responses to a timestamped markdown report
    and append the result to `chatbot`.

    Parameters:
        file_manifest: paths of the .tex files to process (read as UTF-8).
        llm_kwargs:    forwarded unchanged to the request helper.
        chatbot:       chat log; updated in place.
        history:       the incoming value is not read; it is replaced by the
                       collected responses before the final UI refresh.
        language:      'en' or 'zh'; selects the prompt set.
        project_folder, plugin_kwargs, system_prompt: accepted but not used here.

    Raises:
        ValueError: if `language` is neither 'en' nor 'zh'.
    """
    # Fail fast, before any file or network work, instead of hitting an
    # UnboundLocalError on `inputs_array` further down.
    if language not in ('en', 'zh'):
        raise ValueError(f"unsupported language: {language!r} (expected 'en' or 'zh')")

    import re
    import time
    from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency

    # <-------- Read the LaTeX files and remove all comments ---------->
    pfg = PaperFileGroup()
    # A comment starts at an unescaped '%'. The lookbehind keeps an escaped
    # percent sign ("50\% of ...") from being mistaken for a comment, which
    # would truncate the rest of that line.
    comment_pattern = re.compile(r'(?<!\\)%.*')
    for fp in file_manifest:
        with open(fp, 'r', encoding='utf-8') as f:
            file_content = f.read()
        clean_tex_content = comment_pattern.sub('', file_content)
        # Record the comment-free text
        pfg.file_paths.append(fp)
        pfg.file_contents.append(clean_tex_content)

    # <-------- Split over-long LaTeX files ---------->
    pfg.run_file_split(max_token_limit=1024)
    n_split = len(pfg.sp_file_contents)

    # <-------- Extract abstract (not enabled) ---------->
    # if language == 'en':
    #     abs_extract_inputs = f"Please write an abstract for this paper"
    #     # fetch the paper's meta information in a single thread
    #     paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive(
    #         inputs=abs_extract_inputs,
    #         inputs_show_user=f"正在抽取摘要信息。",
    #         llm_kwargs=llm_kwargs,
    #         chatbot=chatbot, history=[],
    #         sys_prompt="Your job is to collect information from materials。",
    #     )

    # <-------- Multi-threaded polishing (first pass) ---------->
    if language == 'en':
        inputs_array = [f"Below is an academic paper, polish the writing to meet the academic style, "+
                        f"improve the spelling, grammar, clarity, concision and overall readability. " +
                        f"The paper begins now: \n{frag}" for frag in pfg.sp_file_contents]
        inputs_show_user_array = [f"Polish {f}" for f in pfg.sp_file_tag]
        sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)]
    elif language == 'zh':
        inputs_array = [f"这里有一个使用Latex格式的学术论文请把写作风格要求的学术风格进行润色改进拼写、语法、清晰度、简洁度和整体可读性。" +
                        f"论文现在开始:\n{frag}" for frag in pfg.sp_file_contents]
        inputs_show_user_array = [f"润色 {f}" for f in pfg.sp_file_tag]
        sys_prompt_array=["你是一位专业的学术论文作家。润色以下论文。输出中保留Latex格式。" for _ in range(n_split)]

    gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=inputs_array,
        inputs_show_user_array=inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        history_array=[[""] for _ in range(n_split)],
        sys_prompt_array=sys_prompt_array,
        max_workers=10,  # per the original note: the maximum parallel load OpenAI allows
        scroller_max_len = 80
    )

    create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + "-chatgpt.polish.md"
    res = write_results_to_file(gpt_response_collection, file_name=create_report_file_name)
    history = gpt_response_collection
    # NOTE(review): `fp` is the last path left over from the read loop above, so
    # the message names only the final file; it is unbound if file_manifest is
    # empty (the visible caller rejects an empty manifest before calling).
    chatbot.append((f"{fp}完成了吗?", res))
    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
@CatchException @CatchException
def 读文章写摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): def Latex英文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
# 基本信息:功能、贡献者
chatbot.append([
"函数插件功能?",
"对整个Latex项目进行润色。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
import tiktoken
except:
report_execption(chatbot, history,
a=f"解析项目: {txt}",
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
history = [] # 清空历史,以免输入溢出 history = [] # 清空历史,以免输入溢出
import glob, os import glob, os
if os.path.exists(txt): if os.path.exists(txt):
@ -56,11 +130,47 @@ def 读文章写摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return return
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] # + \ file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
# [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
# [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
if len(file_manifest) == 0: if len(file_manifest) == 0:
report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return return
yield from 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en')
@CatchException
def Latex中文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """
    Plugin entry point: polish a whole LaTeX project using the Chinese prompt set.

    Generator driven by the UI. `txt` is treated as a path: every `.tex` file
    found recursively under it is passed to `多文件润色` with `language='zh'`.
    The function reports to `chatbot` and returns early when tiktoken cannot
    be imported, when `txt` does not exist, or when no `.tex` file is found.

    `web_port` is accepted but not used here; the remaining arguments are
    forwarded to `多文件润色`. The incoming `history` is discarded.
    """
    # Basic info: what the plugin does and who contributed it
    chatbot.append([
        "函数插件功能?",
        "对整个Latex项目进行润色。函数插件贡献者: Binary-Husky"])
    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI

    # Probe for the extra dependency and give install advice if it is missing.
    # Only ImportError is handled here, so an unrelated failure is not
    # misreported as "dependency missing".
    try:
        import tiktoken  # noqa: F401 -- imported only to check availability
    except ImportError:
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}",
                         b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return

    history = []    # clear the history so the input does not overflow
    import glob, os
    if not os.path.exists(txt):
        if txt == "": txt = '空空如也的输入栏'
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return
    project_folder = txt

    file_manifest = glob.glob(f'{project_folder}/**/*.tex', recursive=True)
    if not file_manifest:
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return
    yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh')

View File

@ -80,7 +80,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
history=history, sys_prompt=sys_prompt, observe_window=mutable) history=history, sys_prompt=sys_prompt, observe_window=mutable)
return result return result
except ConnectionAbortedError as token_exceeded_error: except ConnectionAbortedError as token_exceeded_error:
# 【第二种情况】Token溢出 # 【第二种情况】Token溢出
if handle_token_exceed: if handle_token_exceed:
exceeded_cnt += 1 exceeded_cnt += 1
# 【选择处理】 尝试计算比例,尽可能多地保留文本 # 【选择处理】 尝试计算比例,尽可能多地保留文本
@ -97,7 +97,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback\n\n{tb_str}\n\n" mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback\n\n{tb_str}\n\n"
return mutable[0] # 放弃 return mutable[0] # 放弃
except: except:
# 【第三种情况】:其他错误 # 【第三种情况】:其他错误:重试几次
tb_str = '```\n' + traceback.format_exc() + '```' tb_str = '```\n' + traceback.format_exc() + '```'
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback\n\n{tb_str}\n\n" mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback\n\n{tb_str}\n\n"
if retry_op > 0: if retry_op > 0:
@ -119,7 +119,11 @@ def request_gpt_model_in_new_thread_with_ui_alive(
break break
chatbot[-1] = [chatbot[-1][0], mutable[0]] chatbot[-1] = [chatbot[-1][0], mutable[0]]
yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
return future.result()
final_result = future.result()
chatbot[-1] = [chatbot[-1][0], final_result]
yield from update_ui(chatbot=chatbot, history=[]) # 如果最后成功了,则删除报错信息
return final_result
def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(

View File

@ -96,7 +96,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
# 基本信息:功能、贡献者 # 基本信息:功能、贡献者
chatbot.append([ chatbot.append([
"函数插件功能?", "函数插件功能?",
"批量总结PDF文档。函数插件贡献者: Binary-Husky(二进制哈士奇)"]) "批量总结PDF文档。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议 # 尝试导入依赖,如果缺少依赖,则给出安装建议
@ -185,11 +185,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
final.extend(gpt_response_collection) final.extend(gpt_response_collection)
create_report_file_name = f"{os.path.basename(fp)}.trans.md" create_report_file_name = f"{os.path.basename(fp)}.trans.md"
res = write_results_to_file(final, file_name=create_report_file_name) res = write_results_to_file(final, file_name=create_report_file_name)
generated_conclusion_files.append( generated_conclusion_files.append(f'./gpt_log/{create_report_file_name}')
f'./gpt_log/{create_report_file_name}')
chatbot.append((f"{fp}完成了吗?", res)) chatbot.append((f"{fp}完成了吗?", res))
msg = "完成" yield from update_ui(chatbot=chatbot, history=chatbot) # 刷新界面
yield from update_ui(chatbot=chatbot, history=chatbot, msg=msg) # 刷新界面
# 准备文件的下载 # 准备文件的下载
import shutil import shutil

View File

@ -45,6 +45,7 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
batchsize = 16 # 10个文件为一组 batchsize = 16 # 10个文件为一组
report_part_2 = [] report_part_2 = []
previous_iteration_files = [] previous_iteration_files = []
last_iteration_result = ""
while True: while True:
if len(file_manifest) == 0: break if len(file_manifest) == 0: break
this_iteration_file_manifest = file_manifest[:batchsize] this_iteration_file_manifest = file_manifest[:batchsize]
@ -59,12 +60,13 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
i_say = f'根据以上分析对程序的整体功能和构架重新做出概括。然后用一张markdown表格整理每个文件的功能包括{previous_iteration_files_string})。' i_say = f'根据以上分析对程序的整体功能和构架重新做出概括。然后用一张markdown表格整理每个文件的功能包括{previous_iteration_files_string})。'
inputs_show_user = f'根据以上分析,对程序的整体功能和构架重新做出概括,由于输入长度限制,可能需要分组处理,本组文件为 {current_iteration_focus} + 已经汇总的文件组。' inputs_show_user = f'根据以上分析,对程序的整体功能和构架重新做出概括,由于输入长度限制,可能需要分组处理,本组文件为 {current_iteration_focus} + 已经汇总的文件组。'
this_iteration_history = copy.deepcopy(this_iteration_gpt_response_collection) this_iteration_history = copy.deepcopy(this_iteration_gpt_response_collection)
this_iteration_history.extend(report_part_2) this_iteration_history.extend(last_iteration_result)
result = yield from request_gpt_model_in_new_thread_with_ui_alive( result = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say, inputs_show_user=inputs_show_user, llm_kwargs=llm_kwargs, chatbot=chatbot, inputs=i_say, inputs_show_user=inputs_show_user, llm_kwargs=llm_kwargs, chatbot=chatbot,
history=this_iteration_history, # 迭代之前的分析 history=this_iteration_history, # 迭代之前的分析
sys_prompt="你是一个程序架构分析师,正在分析一个源代码项目") sys_prompt="你是一个程序架构分析师,正在分析一个项目的源代码。")
report_part_2.extend([i_say, result]) report_part_2.extend([i_say, result])
last_iteration_result = result
file_manifest = file_manifest[batchsize:] file_manifest = file_manifest[batchsize:]
gpt_response_collection = gpt_response_collection[batchsize*2:] gpt_response_collection = gpt_response_collection[batchsize*2:]

View File

@ -154,10 +154,12 @@ advanced_css = """
padding: 1em; padding: 1em;
margin: 1em 2em 1em 0.5em; margin: 1em 2em 1em 0.5em;
} }
""" """
if CODE_HIGHLIGHT: if CODE_HIGHLIGHT:
advanced_css += """ advanced_css += """
.hll { background-color: #ffffcc } .hll { background-color: #ffffcc }
.c { color: #3D7B7B; font-style: italic } /* Comment */ .c { color: #3D7B7B; font-style: italic } /* Comment */
.err { border: 1px solid #FF0000 } /* Error */ .err { border: 1px solid #FF0000 } /* Error */

View File

@ -1,5 +1,5 @@
{ {
"version": 2.6, "version": 2.6,
"show_feature": true, "show_feature": true,
"new_feature": "增强多线程稳定性涉及代码解析、PDF翻译等<->修复Token计数错误解决PDF翻译的分割不合理的问题" "new_feature": "现可通过输入区更新临时api-key <-> 增强多线程稳定性涉及代码解析、PDF翻译等 <-> 修复Token计数错误解决PDF翻译的分割不合理的问题 "
} }