chatgpt_academic/crazy_functions/批量总结Markdown文档_进阶.py
leike0813 68a49d3758 Add 2 plugins
相当于将“批量总结PDF文档”插件拆成了两部分,目的在于使用廉价的模型干粗活,再将关键的最终总结交给GPT-4,降低使用成本
批量总结PDF文档_初步:初步总结PDF,每个PDF输出一个md文档
批量总结Markdown文档_进阶:将所有md文档高度凝练并汇总至一个md文档,可直接使用“批量总结PDF文档_初步”的输出结果作为输入
2023-12-20 07:44:53 +08:00

128 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging, os
from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder
from toolbox import CatchException, report_exception, trimmed_format_exc
from toolbox import write_history_to_file, promote_file_to_downloadzone
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import input_clipping
def 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    """Summarize every Markdown file in ``file_manifest`` into a single report.

    Two LLM prompts are prepared per file: one asking for the paper's basic
    information (title, authors, affiliations, keywords) and one asking for a
    four-part summary of the body. Each file is split on the ``'## metadata'``
    marker: the segment right before the last marker becomes the metadata
    text and the segment after it becomes the body. If the marker is absent,
    the file name is used as the metadata text and the whole file is
    summarized.

    Both prompt batches are sent through
    ``request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency``;
    the replies are assembled (file heading, metadata section, summary) and
    written via ``write_history_to_file`` as ``result.md``, which is then
    handed to ``promote_file_to_downloadzone``.

    Args:
        file_manifest: Iterable of paths to the Markdown files to process.
        project_folder: Not used by this function.
        llm_kwargs: Passed through unchanged to the LLM request helper.
        plugin_kwargs: Not used by this function.
        chatbot: Chat UI object passed to the request/UI helpers.
        history: Not used by this function.
        system_prompt: Not used by this function.

    Yields:
        Whatever the request helper and ``update_ui`` yield (this function is
        a generator that delegates to them with ``yield from``).
    """
    SUMMARY_WORD_LIMIT = 800
    METADATA_MARKER = '## metadata'

    file_write_buffer = []
    meta_inputs_array = []
    meta_inputs_show_user_array = []
    meta_sys_prompt_array = []
    inputs_array = []
    inputs_show_user_array = []
    sys_prompt_array = []
    file_name_array = []

    for idx, file_name in enumerate(file_manifest):
        print('begin analysis on:', file_name)
        file_name_array.append(f'# {idx}.{os.path.basename(file_name)}')

        # errors='replace' keeps the run going on files with invalid UTF-8 bytes.
        with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()

        sections = file_content.split(METADATA_MARKER)
        if len(sections) >= 2:
            file_meta = sections[-2]
            file_content = sections[-1]
        else:
            # No metadata marker: fall back to the file name as the only hint.
            file_meta = file_name

        meta_inputs_array.append(
            "我需要你从一段文本中识别并提取出这篇文章的1.标题、2.作者、3.作者单位、4.关键词。"
            "其中1.标题和4.关键词需要给出中文和英文的双语结果2.作者和3.作者单位按原文语言给出。"
            "以下是需要你识别的文本: " + file_meta
        )
        meta_inputs_show_user_array.append(
            '开始分析元数据:' + file_name
        )
        meta_sys_prompt_array.append("As an academic professional, you need to extract basic informations of the paper from its metadata")

        inputs_array.append(
            "我需要你根据我提供的文本总结一份Markdown文档分为四个部分1.研究背景2.文章主要内容3.主要创新点4.结论。"
            + f"各部分的题目采用二级标题前缀(## ),内容可适当的分为若干条,总字数不超过{SUMMARY_WORD_LIMIT}个中文字符."
            + "以下是需要你处理的文本: " + file_content)
        inputs_show_user_array.append('开始总结:' + file_name)
        sys_prompt_array.append(f"As an academic professional, you need to summarize the text with less than {SUMMARY_WORD_LIMIT} Chinese characters")

    # First batch: metadata extraction, one request per file.
    gpt_meta_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=meta_inputs_array,
        inputs_show_user_array=meta_inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        history_array=[[""] for _ in meta_inputs_array],
        sys_prompt_array=meta_sys_prompt_array,
        # max_workers=5,  # maximum parallel load allowed by OpenAI
        scroller_max_len=80
    )

    # Second batch: body summaries, one request per file.
    gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=inputs_array,
        inputs_show_user_array=inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        history_array=[[""] for _ in inputs_array],
        sys_prompt_array=sys_prompt_array,
        # max_workers=5,  # maximum parallel load allowed by OpenAI
        scroller_max_len=80
    )

    # Best effort: a failure while assembling is logged and whatever was
    # collected so far is still written out. Only ordinary exceptions are
    # caught, so KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        # Every second element starting at index 1 is used as a reply
        # (presumably the collection alternates prompt/reply — confirm against crazy_utils).
        meta_replies = gpt_meta_response_collection[1::2]
        summary_replies = gpt_response_collection[1::2]
        for idx, (gpt_say_meta, gpt_say) in enumerate(zip(meta_replies, summary_replies)):
            file_write_buffer.append(file_name_array[idx])
            file_write_buffer.append("## 元数据\n\n" + gpt_say_meta)
            file_write_buffer.append(gpt_say)
    except Exception:
        logging.error(trimmed_format_exc())

    res = write_history_to_file(file_write_buffer, file_basename="result.md", auto_caption=False)
    promote_file_to_downloadzone(res, chatbot=chatbot)
    yield from update_ui(chatbot=chatbot, history=gpt_response_collection)  # refresh the UI
@CatchException
def 批量总结Markdown文档_进阶(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """Plugin entry point: summarize all ``.md`` files found under a path.

    Validates ``txt`` as an existing local path, recursively collects every
    ``*.md`` file below it, and delegates the actual work to ``总结Markdown``.
    If the path does not exist or contains no Markdown files, the problem is
    reported through ``report_exception`` and the generator returns early.

    Args:
        txt: Path typed by the user; searched recursively for ``*.md`` files.
        llm_kwargs: Forwarded unchanged to ``总结Markdown``.
        plugin_kwargs: Forwarded unchanged to ``总结Markdown``.
        chatbot: Chat UI object; an introductory message is appended to it.
        history: Incoming history; it is discarded and replaced with an empty
            list before processing to avoid overflowing the model input.
        system_prompt: Forwarded unchanged to ``总结Markdown``.
        web_port: Not used by this function.

    Yields:
        Whatever ``update_ui`` and ``总结Markdown`` yield (delegated with
        ``yield from``).
    """
    import glob
    import os

    # Basic info: what the plugin does and who contributed it.
    chatbot.append([
        "函数插件功能?",
        "批量总结Markdown文档。函数插件贡献者: ValeriaWongEralienJoshua Reed"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # No PyMuPDF (fitz) availability check here: this plugin only reads
    # Markdown text files, so gating it on a PDF library would block users
    # who do not have that unrelated package installed.

    # Clear the history so the model input does not overflow.
    history = []

    # Validate the input; bail out if no usable path was given.
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "":
            txt = '空空如也的输入栏'
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Collect the files to process.
    file_manifest = glob.glob(f'{project_folder}/**/*.md', recursive=True)

    # Nothing found: report and stop.
    if not file_manifest:
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.md文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Run the actual task.
    yield from 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)