Improve the PDF summarization plugin
This commit is contained in:
parent fcc5534e66
commit b298c5416c
@@ -1,121 +1,107 @@
-from toolbox import update_ui
+from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str
 from toolbox import CatchException, report_execption, write_results_to_file
-import re
-import unicodedata
-fast_debug = False
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
+from .crazy_utils import read_and_clean_pdf_text
+from .crazy_utils import input_clipping
 
-def is_paragraph_break(match):
-    """
-    Decide from the regex match whether a newline marks a paragraph break.
-    If the character before the newline ends a sentence (period, exclamation
-    mark, question mark) and the next character is uppercase, the newline is
-    more likely a paragraph break. The length of the preceding content is
-    also used to check that the paragraph is already long enough.
-    """
-    prev_char, next_char = match.groups()
-
-    # Sentence-ending punctuation
-    sentence_endings = ".!?"
-
-    # Minimum paragraph length threshold
-    min_paragraph_length = 140
-
-    if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
-        return "\n\n" 
-    else:
-        return " "
-
-def normalize_text(text):
-    """
-    Normalize the text by decomposing ligatures and other special glyphs
-    into their basic forms, e.g. turning the ligature "fi" into "f" and "i".
-    """
-    # Normalize the text, decomposing ligatures
-    normalized_text = unicodedata.normalize("NFKD", text)
-
-    # Drop any remaining non-ASCII characters
-    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
-
-    return cleaned_text
-
-def clean_text(raw_text):
-    """
-    Clean up and reformat the raw text extracted from a PDF.
-    1. Normalize the raw text.
-    2. Rejoin words hyphenated across line breaks.
-    3. Decide heuristically whether each newline is a paragraph break,
-       and replace it accordingly.
-    """
-    # Normalize the text
-    normalized_text = normalize_text(raw_text)
-
-    # Rejoin words hyphenated across line breaks
-    text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text)
-
-    # Find newlines sitting between two non-whitespace characters
-    newlines = re.compile(r'(\S)\n(\S)')
-
-    # Replace each such newline with a space or a paragraph break, per the heuristic
-    final_text = re.sub(newlines, lambda m: m.group(1) + is_paragraph_break(m) + m.group(2), text)
-
-    return final_text.strip()
 
 def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
-    import time, glob, os, fitz
-    print('begin analysis on:', file_manifest)
-    for index, fp in enumerate(file_manifest):
-        with fitz.open(fp) as doc:
-            file_content = ""
-            for page in doc:
-                file_content += page.get_text()
-            file_content = clean_text(file_content)
-            print(file_content)
+    file_write_buffer = []
+    for file_name in file_manifest:
+        print('begin analysis on:', file_name)
+        ############################## <Step 0: split the PDF> ##################################
+        # Split the PDF recursively; each chunk (preferably a complete section, e.g. the
+        # introduction or the experiments, split further only when necessary)
+        # must stay under 2500 tokens
+        file_content, page_one = read_and_clean_pdf_text(file_name) # (try to) split the PDF by section
+        file_content = file_content.encode('utf-8', 'ignore').decode()   # avoid reading non-utf8 chars
+        page_one = str(page_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
 
-        prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
-        i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
-        i_say_show_user = prefix + f'[{index + 1}/{len(file_manifest)}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
-        chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
-        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+        TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-        if not fast_debug: 
-            msg = '正常'
-            # ** gpt request **
+        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+        from request_llm.bridge_all import model_info
+        enc = model_info["gpt-3.5-turbo"]['tokenizer']
+        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
+        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+            txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
+        page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+            txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+        # For better results, strip everything after the Introduction (if present)
+        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+
+        ############################## <Step 1: extract high-value information from the abstract into history> ##################################
+        final_results = []
+        final_results.append(paper_meta)
+
+        ############################## <Step 2: iterate over the whole paper, distilling the content> ##################################
+        i_say_show_user = f'首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # user-facing prompt
+        chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # update the UI
+
+        iteration_results = []
+        last_iteration_result = paper_meta  # the initial value is the abstract
+        MAX_WORD_TOTAL = 4096 * 0.7
+        n_fragment = len(paper_fragments)
+        if n_fragment >= 20: print('文章极长,不能达到预期效果')
+        for i in range(n_fragment):
+            NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
+            i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
+            i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
+            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = the actual question sent to the LLM, i_say_show_user = the question shown to the user
+                                                                                llm_kwargs, chatbot, 
+                                                                                history=["The main idea of the previous section is?", last_iteration_result], # iterate on the previous result
+                                                                                sys_prompt="Extract the main idea of this section with Chinese."  # system prompt
+                                                                                ) 
+            iteration_results.append(gpt_say)
+            last_iteration_result = gpt_say
+
+        ############################## <Step 3: organize the history and extract a summary> ##################################
+        final_results.extend(iteration_results)
+        final_results.append(f'Please conclude this paper discussed above。')
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        NUM_OF_WORD = 1000
+        i_say = """
+1. Mark the title of the paper (with Chinese translation)
+2. list all the authors' names (use English)
+3. mark the first author's affiliation (output Chinese translation only)
+4. mark the keywords of this article (use English)
+5. link to the paper, Github code link (if available, fill in Github:None if not)
+6. summarize according to the following four points.Be sure to use Chinese answers (proper nouns need to be marked in English)
+    - (1):What is the research background of this article?
+    - (2):What are the past methods? What are the problems with them? Is the approach well motivated?
+    - (3):What is the research methodology proposed in this paper?
+    - (4):On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
+Follow the format of the output that follows:                  
+1. Title: xxx\n\n
+2. Authors: xxx\n\n
+3. Affiliation: xxx\n\n
+4. Keywords: xxx\n\n
+5. Urls: xxx or xxx , xxx \n\n
+6. Summary: \n\n
+    - (1):xxx;\n 
+    - (2):xxx;\n 
+    - (3):xxx;\n
+    - (4):xxx.\n\n
+Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible,
+do not have too much repetitive information, numerical values using the original numbers.
+        """
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        file_write_buffer.extend(final_results)
+        i_say, final_results = input_clipping(i_say, final_results, max_token_limit=2000)
         gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
-                inputs=i_say, 
-                inputs_show_user=i_say_show_user, 
-                llm_kwargs=llm_kwargs,
-                chatbot=chatbot, 
-                history=[],
-                sys_prompt="总结文章。"
-            )  # with a timeout countdown
-            chatbot[-1] = (i_say_show_user, gpt_say)
-            history.append(i_say_show_user); history.append(gpt_say)
-            yield from update_ui(chatbot=chatbot, history=history, msg=msg) # refresh the UI
-            if not fast_debug: time.sleep(2)
-
-    all_file = ', '.join([os.path.relpath(fp, project_folder) for index, fp in enumerate(file_manifest)])
-    i_say = f'根据以上你自己的分析,对全文进行概括,用学术性语言写一段中文摘要,然后再写一段英文摘要(包括{all_file})。'
-    chatbot.append((i_say, "[Local Message] waiting gpt response."))
-    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
-
-    if not fast_debug: 
-        msg = '正常'
-        # ** gpt request **
-        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
-            inputs=i_say, 
-            inputs_show_user=i_say, 
-            llm_kwargs=llm_kwargs,
-            chatbot=chatbot, 
-            history=history,
-            sys_prompt="总结文章。"
-        )  # with a timeout countdown
-        chatbot[-1] = (i_say, gpt_say)
-        history.append(i_say); history.append(gpt_say)
-        yield from update_ui(chatbot=chatbot, history=history, msg=msg) # refresh the UI
-        res = write_results_to_file(history)
-        chatbot.append(("完成了吗?", res))
-        yield from update_ui(chatbot=chatbot, history=history, msg=msg) # refresh the UI
+            inputs=i_say, inputs_show_user='开始最终总结', 
+            llm_kwargs=llm_kwargs, chatbot=chatbot, history=final_results, 
+            sys_prompt= f"Extract the main idea of this paper with less than {NUM_OF_WORD} Chinese characters"
+        )
+        final_results.append(gpt_say)
+        file_write_buffer.extend([i_say, gpt_say])
+        ############################## <Step 4: enforce a token cap> ##################################
+        _, final_results = input_clipping("", final_results, max_token_limit=3200)
+        yield from update_ui(chatbot=chatbot, history=final_results) # note that the history is replaced here
+
+    res = write_results_to_file(file_write_buffer, file_name=gen_time_str())
+    promote_file_to_downloadzone(res.split('\t')[-1], chatbot=chatbot)
+    yield from update_ui(chatbot=chatbot, history=final_results) # refresh the UI
 
 @CatchException
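Step 0 above hands the actual work to read_and_clean_pdf_text and breakdown_txt_to_satisfy_token_limit_for_pdf from crazy_utils. As a mental model only, not the project's implementation, a token-limited splitter of this kind can be sketched as follows; count_tokens and split_to_fit are hypothetical names, and the four-characters-per-token estimate stands in for the real tokenizer behind get_token_num:

def count_tokens(txt):
    # Hypothetical stand-in for the tokenizer; assume ~4 characters per token.
    return max(1, len(txt) // 4)

def split_to_fit(txt, limit, seps=("\n\n", "\n", ". ", " ")):
    # Recursively split txt so every fragment stays under the token limit,
    # preferring paragraph breaks, then lines, sentences, and single words.
    if count_tokens(txt) <= limit:
        return [txt]
    for sep in seps:
        parts = txt.split(sep)
        if len(parts) > 1:
            fragments, buf = [], ""
            for part in parts:
                candidate = (buf + sep + part) if buf else part
                if count_tokens(candidate) <= limit:
                    buf = candidate        # greedily grow the current fragment
                else:
                    if buf:
                        fragments.append(buf)
                    buf = part
            if buf:
                fragments.append(buf)
            # A single part may still be too long, so recurse on each fragment.
            return [piece for frag in fragments for piece in split_to_fit(frag, limit, seps)]
    # No separator helped; fall back to a hard cut in the middle.
    mid = len(txt) // 2
    return split_to_fit(txt[:mid], limit, seps) + split_to_fit(txt[mid:], limit, seps)

Per the Step 0 comment, the real pipeline additionally prefers chunks that are complete sections (introduction, experiments, and so on), splitting harder only when a section alone exceeds the limit.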
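Step 2 is a refine-style pass: each fragment is summarized with the previous fragment's summary injected as two turns of fake dialogue history, so context accumulates while every request stays inside its own token budget. Schematically, with ask_llm and summarize_paper as hypothetical stand-ins for request_gpt_model_in_new_thread_with_ui_alive and the loop above:

def summarize_paper(paper_fragments, paper_meta, ask_llm, max_word_total=int(4096 * 0.7)):
    # Refine loop: fragment i is summarized with fragment i-1's summary as history.
    iteration_results = []
    last = paper_meta                                        # seeded with the first-page metadata
    budget = max_word_total // max(1, len(paper_fragments))  # spread the word budget evenly
    for fragment in paper_fragments:
        prompt = (f"Read this section, recapitulate the content of this section "
                  f"with less than {budget} Chinese characters: {fragment}")
        summary = ask_llm(prompt,
                          history=["The main idea of the previous section is?", last])
        iteration_results.append(summary)
        last = summary                                       # carried forward into the next call
    return iteration_results

This also explains the warning printed for papers with 20 or more fragments: the fixed MAX_WORD_TOTAL is divided across all fragments, so very long papers leave too few characters per recapitulation.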
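Two different token caps appear in the new code: input_clipping(i_say, final_results, max_token_limit=2000) before the final request, and input_clipping("", final_results, max_token_limit=3200) in Step 4 before the history is handed back to the UI. A minimal version of the idea, a sketch rather than the crazy_utils implementation (clip_history is a hypothetical name), drops the oldest entries until everything fits:

def clip_history(inputs, history, count_tokens, max_token_limit):
    # Keep the newest history entries; drop from the front until the
    # prompt plus history fit inside max_token_limit.
    budget = max_token_limit - count_tokens(inputs)
    clipped = list(history)
    while clipped and sum(count_tokens(h) for h in clipped) > budget:
        clipped.pop(0)
    return inputs, clipped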
@@ -151,10 +137,7 @@ def 批量总结PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
         return
 
     # Search for the files that need processing
-    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] # + \
-                    # [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \
-                    # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
-                    # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]
     
     # If no files were found
     if len(file_manifest) == 0:
@@ -214,7 +214,7 @@ def write_results_to_file(history, file_name=None):
                 # remove everything that cannot be handled by utf8
                 f.write(content.encode('utf-8', 'ignore').decode())
             f.write('\n\n')
-    res = '以上材料已经被写入' + os.path.abspath(f'./gpt_log/{file_name}')
+    res = '以上材料已经被写入:\t' + os.path.abspath(f'./gpt_log/{file_name}')
     print(res)
     return res
 
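The change above looks cosmetic, but the inserted ':\t' is load-bearing: 解析PDF in the first hunk recovers the written file's path with res.split('\t')[-1] before handing it to promote_file_to_downloadzone, and the tab is what makes that split well-defined. For example (the path here is hypothetical):

res = '以上材料已经被写入:\t' + '/home/user/gpt_log/2023-05-20-12-00-00.md'
assert res.split('\t')[-1] == '/home/user/gpt_log/2023-05-20-12-00-00.md'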
@@ -467,8 +467,11 @@ def promote_file_to_downloadzone(file, rename_file=None, chatbot=None):
     import shutil
     if rename_file is None: rename_file = f'{gen_time_str()}-{os.path.basename(file)}'
     new_path = os.path.join(f'./gpt_log/', rename_file)
+    # If the target already exists, remove it first
     if os.path.exists(new_path) and not os.path.samefile(new_path, file): os.remove(new_path)
+    # Copy the file over
     if not os.path.exists(new_path): shutil.copyfile(file, new_path)
+    # Record the file in the chatbot cookies, to avoid interference between users
     if chatbot:
         if 'file_to_promote' in chatbot._cookies: current = chatbot._cookies['file_to_promote']
         else: current = []
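The new comments document why promote_file_to_downloadzone goes through chatbot._cookies: every session keeps its own file_to_promote list, so files promoted in one user's session never appear in another user's download zone. The excerpt ends just before the list is written back; a sketch of the bookkeeping the comments describe (SessionChatbot and promote_sketch are hypothetical names, and the exact update in the real code may differ):

class SessionChatbot(list):
    # Minimal stand-in: a chat transcript that also carries per-session
    # cookies, mirroring the chatbot._cookies accesses above.
    def __init__(self):
        super().__init__()
        self._cookies = {}

def promote_sketch(new_path, chatbot):
    # Read the session's current promoted files, then store the updated
    # list back into that session's own cookies (newest first).
    current = chatbot._cookies.get('file_to_promote', [])
    chatbot._cookies['file_to_promote'] = [new_path] + current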