Add 2 plugins
相当于将“批量总结PDF文档”插件拆成了两部分,目的在于使用廉价的模型干粗活,再将关键的最终总结交给GPT-4,降低使用成本 批量总结PDF文档_初步:初步总结PDF,每个PDF输出一个md文档 批量总结Markdown文档_进阶:将所有md文档高度凝练并汇总至一个md文档,可直接使用“批量总结PDF文档_初步”的输出结果作为输入
This commit is contained in:
		
							parent
							
								
									ac3d4cf073
								
							
						
					
					
						commit
						68a49d3758
					
				@ -603,7 +603,35 @@ def get_crazy_functions():
 | 
			
		||||
    except:
 | 
			
		||||
        print(trimmed_format_exc())
 | 
			
		||||
        print('Load function plugin failed')
 | 
			
		||||
    try:
 | 
			
		||||
        from crazy_functions.批量总结PDF文档_初步 import 批量总结PDF文档_初步
 | 
			
		||||
        function_plugins.update({
 | 
			
		||||
            "批量总结PDF文档_初步": {
 | 
			
		||||
                "Group": "学术",
 | 
			
		||||
                "Color": "stop",
 | 
			
		||||
                "AsButton": False,
 | 
			
		||||
                "Info": "批量总结PDF文档的内容(仅做初步提炼) | 输入参数为路径",
 | 
			
		||||
                "Function": HotReload(批量总结PDF文档_初步)
 | 
			
		||||
            }
 | 
			
		||||
        })
 | 
			
		||||
    except:
 | 
			
		||||
        print(trimmed_format_exc())
 | 
			
		||||
        print('Load function plugin failed')
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
        from crazy_functions.批量总结Markdown文档_进阶 import 批量总结Markdown文档_进阶
 | 
			
		||||
        function_plugins.update({
 | 
			
		||||
            "批量总结Markdown文档_进阶": {
 | 
			
		||||
                "Group": "学术",
 | 
			
		||||
                "Color": "stop",
 | 
			
		||||
                "AsButton": False,
 | 
			
		||||
                "Info": "批量总结Markdown文档的内容(在初步提炼的基础上进一步总结) | 输入参数为路径",
 | 
			
		||||
                "Function": HotReload(批量总结Markdown文档_进阶)
 | 
			
		||||
            }
 | 
			
		||||
        })
 | 
			
		||||
    except:
 | 
			
		||||
        print(trimmed_format_exc())
 | 
			
		||||
        print('Load function plugin failed')
 | 
			
		||||
    # try:
 | 
			
		||||
    #     from crazy_functions.chatglm微调工具 import 微调数据集生成
 | 
			
		||||
    #     function_plugins.update({
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										127
									
								
								crazy_functions/批量总结Markdown文档_进阶.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										127
									
								
								crazy_functions/批量总结Markdown文档_进阶.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,127 @@
 | 
			
		||||
import logging, os
 | 
			
		||||
from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder
 | 
			
		||||
from toolbox import CatchException, report_exception, trimmed_format_exc
 | 
			
		||||
from toolbox import write_history_to_file, promote_file_to_downloadzone
 | 
			
		||||
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 | 
			
		||||
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 | 
			
		||||
from .crazy_utils import input_clipping
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    """
    Summarize a batch of Markdown files (typically the output of 批量总结PDF文档_初步)
    into a single condensed `result.md`.

    For each file, two parallel GPT requests are issued:
      1. a metadata-extraction request (title / authors / affiliation / keywords),
         fed from the text between the `## metadata` markers written by the
         preliminary PDF plugin;
      2. a content-summary request over the remaining body text.
    The paired answers are stitched together per file and written to one
    Markdown result file, which is then promoted to the download zone.

    Parameters mirror the standard plugin signature; `project_folder`,
    `plugin_kwargs` and `system_prompt` are accepted for interface
    compatibility but are not used here.
    """
    file_write_buffer = []
    SUMMARY_WORD_LIMIT = 800  # per-file summary budget, in Chinese characters
    meta_inputs_array = []
    meta_inputs_show_user_array = []
    meta_sys_prompt_array = []
    inputs_array = []
    inputs_show_user_array = []
    sys_prompt_array = []
    file_name_array = []
    for idx, file_name in enumerate(file_manifest):
        print('begin analysis on:', file_name)
        file_name_array.append(f'# {idx}.{os.path.basename(file_name)}')

        with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()

        # The preliminary plugin brackets the metadata section as
        # "## metadata\n\n<meta>\n\n## metadata<body>", so splitting on the
        # marker yields [prefix, meta, body]: second-to-last part is the
        # metadata, last part is the body. Files without the marker fall back
        # to using the file name as the metadata source.
        _ = file_content.split('## metadata')
        if len(_) >= 2:
            file_meta = _[-2]
            file_content = _[-1]
        else:
            file_meta = file_name

        meta_inputs_array.append(
            "我需要你从一段文本中识别并提取出这篇文章的1.标题、2.作者、3.作者单位、4.关键词。"
            "其中,1.标题和4.关键词需要给出中文和英文的双语结果,2.作者和3.作者单位按原文语言给出。"
            "以下是需要你识别的文本: " + file_meta
        )
        meta_inputs_show_user_array.append(
            '开始分析元数据:' + file_name
        )
        meta_sys_prompt_array.append("As an academic professional, you need to extract basic informations of the paper from its metadata")

        inputs_array.append(
            "我需要你根据我提供的文本总结一份Markdown文档,分为四个部分:1.研究背景,2.文章主要内容,3.主要创新点,4.结论。"
            + f"各部分的题目采用二级标题前缀(## ),内容可适当的分为若干条,总字数不超过{SUMMARY_WORD_LIMIT}个中文字符."
            + "以下是需要你处理的文本: " + file_content)
        inputs_show_user_array.append('开始总结:' + file_name)
        sys_prompt_array.append(f"As an academic professional, you need to summarize the text with less than {SUMMARY_WORD_LIMIT} Chinese characters")

    # Metadata extraction pass (cheap, runs on all files in parallel).
    gpt_meta_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=meta_inputs_array,
        inputs_show_user_array=meta_inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        # FIX: history sized from the meta arrays themselves (previously sized
        # from inputs_array — same length in practice, but fragile).
        history_array=[[""] for _ in range(len(meta_inputs_array))],
        sys_prompt_array=meta_sys_prompt_array,
        # max_workers=5,  # maximum parallelism allowed by OpenAI
        scroller_max_len=80
    )

    # Content summary pass.
    gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=inputs_array,
        inputs_show_user_array=inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        history_array=[[""] for _ in range(len(inputs_array))],
        sys_prompt_array=sys_prompt_array,
        # max_workers=5,  # maximum parallelism allowed by OpenAI
        scroller_max_len=80
    )
    try:
        # Response collections alternate [prompt, answer, prompt, answer, ...];
        # [1::2] picks out the answers. Best-effort: a malformed collection is
        # logged instead of aborting the whole batch.
        for idx, (gpt_say_meta, gpt_say) in enumerate(zip(gpt_meta_response_collection[1::2], gpt_response_collection[1::2])):
            file_write_buffer.append(file_name_array[idx])
            file_write_buffer.append("## 元数据\n\n" + gpt_say_meta)
            file_write_buffer.append(gpt_say)
    except Exception:  # FIX: was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        logging.error(trimmed_format_exc())

    res = write_history_to_file(file_write_buffer, file_basename="result.md", auto_caption=False)
    promote_file_to_downloadzone(res, chatbot=chatbot)
    yield from update_ui(chatbot=chatbot, history=gpt_response_collection) # refresh UI
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@CatchException
def 批量总结Markdown文档_进阶(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """
    Plugin entry point: condense all `.md` files under the folder given in
    *txt* into a single summary document (see 总结Markdown). Intended to be
    fed the output folder of the 批量总结PDF文档_初步 plugin.

    Standard plugin signature; yields UI updates and returns early (after
    reporting via report_exception) when the input path is invalid or no
    Markdown files are found.
    """
    import glob, os

    # Basic info: feature description and contributors.
    chatbot.append([
        "函数插件功能?",
        "批量总结Markdown文档。函数插件贡献者: ValeriaWong,Eralien,Joshua Reed"])
    yield from update_ui(chatbot=chatbot, history=history) # refresh UI

    # FIX: removed the `import fitz` (pymupdf) dependency probe that was
    # copy-pasted from the PDF plugin — this plugin only reads Markdown text
    # and does not need pymupdf, so the probe could spuriously block it.

    # Clear the history to avoid input overflow.
    history = []

    # Validate the input path; bail out early if it was not provided or does not exist.
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh UI
        return

    # Collect the list of files to process (recursive).
    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.md', recursive=True)]

    # No matching files found.
    if len(file_manifest) == 0:
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.md文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh UI
        return

    # Start the actual task.
    yield from 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
 | 
			
		||||
							
								
								
									
										131
									
								
								crazy_functions/批量总结PDF文档_初步.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								crazy_functions/批量总结PDF文档_初步.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,131 @@
 | 
			
		||||
import zipfile
 | 
			
		||||
import os
 | 
			
		||||
from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder
 | 
			
		||||
from toolbox import CatchException, report_exception
 | 
			
		||||
from toolbox import write_history_to_file, promote_file_to_downloadzone
 | 
			
		||||
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 | 
			
		||||
from .crazy_utils import read_and_clean_pdf_text
 | 
			
		||||
from .crazy_utils import input_clipping
 | 
			
		||||
pj = os.path.join
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def move_file_to_zip(file_path, zip_file):
    """Move *file_path* into the open *zip_file* archive.

    The file is stored under its base name (directory components are
    dropped) and the original on-disk file is deleted afterwards.
    """
    archive_name = os.path.basename(file_path)
    zip_file.write(file_path, archive_name)
    os.remove(file_path)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    """
    Produce a preliminary per-PDF summary for every file in *file_manifest*.

    Each PDF is split into token-limited fragments, iteratively summarized
    fragment-by-fragment (each call carries the previous fragment's summary
    as context), and written to `<pdf-basename>.md`. With multiple input
    files the per-file results are collected into `result.zip`; with exactly
    one input file the single `.md` is promoted directly and the function
    returns early (NOTE(review): in that case the already-created empty
    `result.zip` is left behind in the log folder — confirm intended).

    Parameters follow the standard plugin signature; `project_folder`,
    `plugin_kwargs`, `history` and `system_prompt` are unused here.
    """
    zip_file_path = pj(get_log_folder(), 'result.zip')
    with zipfile.ZipFile(zip_file_path, 'w') as zip_file:
        for file_name in file_manifest:
            file_write_buffer = []
            print('begin analysis on:', file_name)
            ############################## <Step 0: split the PDF> ##################################
            # Recursively split the PDF; each chunk (preferably one complete section,
            # e.g. introduction, experiment, ..., split further only when necessary)
            # must stay under 2500 tokens.
            file_content, page_one = read_and_clean_pdf_text(file_name) # (attempt to) split the PDF by section
            file_content = file_content.encode('utf-8', 'ignore').decode()   # avoid reading non-utf8 chars
            page_one = str(page_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars

            TOKEN_LIMIT_PER_FRAGMENT = 2500

            from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
            from request_llms.bridge_all import model_info
            # Token counting always uses the gpt-3.5-turbo tokenizer, regardless of
            # the model actually selected in llm_kwargs.
            enc = model_info["gpt-3.5-turbo"]['tokenizer']
            def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
            paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
                txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
            page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
                txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
            # For better results, strip everything after "Introduction" (if present)
            # so the metadata prompt sees only the title/abstract region of page one.
            paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]

            ############################## <Step 1: extract high-value info from the abstract into history> ##################################
            # The "## metadata ... ## metadata" markers are later parsed by the
            # 批量总结Markdown文档_进阶 plugin to locate the metadata section.
            final_results = []
            final_results.append("## metadata\n\n" + paper_meta + "\n\n## metadata")

            ############################## <Step 2: iterate over the whole paper, distilling each fragment> ##################################
            i_say_show_user = f'首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # user-facing prompt
            chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # update UI

            iteration_results = []
            last_iteration_result = paper_meta  # initial context is the abstract/metadata
            MAX_WORD_TOTAL = 4096 * 0.7
            n_fragment = len(paper_fragments)
            if n_fragment >= 20: print('文章极长,不能达到预期效果')
            for i in range(n_fragment):
                # Per-fragment word budget: total budget divided evenly over fragments.
                NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
                i_say = f"Read this section, recapitulate the content of this section in Chinese with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
                i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
                gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say=actual prompt for the model, i_say_show_user=what the user sees
                                                                                    llm_kwargs, chatbot,
                                                                                    history=["The main idea of the previous section is?", last_iteration_result], # chain the previous fragment's summary
                                                                                    sys_prompt="Extract the main idea of this section with Chinese."  # system prompt
                                                                                    )
                iteration_results.append(gpt_say)
                last_iteration_result = gpt_say

            ############################## <Step 3: assemble history, collect the summary> ##################################
            final_results.extend(iteration_results)
            file_write_buffer.extend(final_results)

            ############################## <Step 4: enforce a token ceiling> ##################################
            _, final_results = input_clipping("", final_results, max_token_limit=3200)
            yield from update_ui(chatbot=chatbot, history=final_results) # note: the chat history is replaced here

            res = write_history_to_file(
                file_write_buffer,
                file_basename=os.path.splitext(os.path.basename(file_name))[0] + '.md',
                auto_caption=False
            )
            # Single-file case: promote the markdown directly instead of zipping.
            if len(file_manifest) == 1:
                promote_file_to_downloadzone(res, chatbot=chatbot)
                return
            move_file_to_zip(res, zip_file)

    promote_file_to_downloadzone(zip_file_path, chatbot=chatbot)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@CatchException
def 批量总结PDF文档_初步(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """
    Plugin entry point: produce a preliminary per-file summary for every
    `.pdf` found (recursively) under the folder given in *txt*; see 解析PDF.

    Standard plugin signature; yields UI updates and returns early (after
    reporting via report_exception) when dependencies are missing, the input
    path is invalid, or no PDF files are found.
    """
    import glob, os

    # Basic info: feature description and contributors.
    chatbot.append([
        "函数插件功能?",
        "批量总结PDF文档。函数插件贡献者: ValeriaWong,Eralien,Joshua Reed"])
    yield from update_ui(chatbot=chatbot, history=history) # refresh UI

    # Probe the optional dependency; suggest the install command if it is missing.
    try:
        import fitz
    except Exception:  # FIX: was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        report_exception(chatbot, history, 
            a = f"解析项目: {txt}", 
            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
        yield from update_ui(chatbot=chatbot, history=history) # refresh UI
        return

    # Clear the history to avoid input overflow.
    history = []

    # Validate the input path; bail out early if it was not provided or does not exist.
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh UI
        return

    # Collect the list of files to process (recursive).
    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]

    # No matching files found.
    if len(file_manifest) == 0:
        # FIX: the message previously claimed "找不到任何.tex或.pdf文件" although
        # only *.pdf is searched — corrected to match the actual glob.
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh UI
        return

    # Start the actual task.
    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user