Improve the PDF summarization plugin
This commit is contained in:
parent fcc5534e66
commit b298c5416c
@@ -1,121 +1,107 @@
-from toolbox import update_ui
+from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str
 from toolbox import CatchException, report_execption, write_results_to_file
-import re
-import unicodedata
-fast_debug = False
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
+from .crazy_utils import read_and_clean_pdf_text
+from .crazy_utils import input_clipping
 
-def is_paragraph_break(match):
-    """
-    Decide from the regex match whether a newline marks a paragraph break.
-    If the character before the newline ends a sentence (period, exclamation
-    mark, question mark) and the next character is uppercase, the newline is
-    more likely a paragraph break. The length of the preceding content is
-    also used to check that the paragraph is already long enough.
-    """
-    prev_char, next_char = match.groups()
-
-    # Sentence-ending punctuation
-    sentence_endings = ".!?"
-
-    # Minimum paragraph length threshold
-    min_paragraph_length = 140
-
-    if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
-        return "\n\n" 
-    else:
-        return " "
-
-def normalize_text(text):
-    """
-    Normalize the text by decomposing ligatures and other special glyphs
-    into their basic forms, e.g. turning the ligature "fi" into "f" and "i".
-    """
-    # Normalize the text, decomposing ligatures
-    normalized_text = unicodedata.normalize("NFKD", text)
-
-    # Drop any remaining non-ASCII characters
-    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
-
-    return cleaned_text
-
-def clean_text(raw_text):
-    """
-    Clean up and reformat the raw text extracted from a PDF.
-    1. Normalize the raw text.
-    2. Rejoin words hyphenated across line breaks.
-    3. Decide heuristically whether each newline is a paragraph break,
-       and replace it accordingly.
-    """
-    # Normalize the text
-    normalized_text = normalize_text(raw_text)
-
-    # Rejoin words hyphenated across line breaks
-    text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text)
-
-    # Find newlines sitting between two non-whitespace characters
-    newlines = re.compile(r'(\S)\n(\S)')
-
-    # Replace each such newline with a space or a paragraph break, per the heuristic
-    final_text = re.sub(newlines, lambda m: m.group(1) + is_paragraph_break(m) + m.group(2), text)
-
-    return final_text.strip()
 
 def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
-    import time, glob, os, fitz
-    print('begin analysis on:', file_manifest)
-    for index, fp in enumerate(file_manifest):
-        with fitz.open(fp) as doc:
-            file_content = ""
-            for page in doc:
-                file_content += page.get_text()
-            file_content = clean_text(file_content)
-            print(file_content)
+    file_write_buffer = []
+    for file_name in file_manifest:
+        print('begin analysis on:', file_name)
+        ############################## <Step 0: split the PDF> ##################################
+        # Split the PDF recursively; each chunk (preferably a complete section, e.g. the
+        # introduction or the experiments, split further only when necessary)
+        # must stay under 2500 tokens
+        file_content, page_one = read_and_clean_pdf_text(file_name) # (try to) split the PDF by section
+        file_content = file_content.encode('utf-8', 'ignore').decode()   # avoid reading non-utf8 chars
+        page_one = str(page_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
 
-        prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
-        i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
-        i_say_show_user = prefix + f'[{index + 1}/{len(file_manifest)}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
-        chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
-        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
+        TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-        if not fast_debug: 
-            msg = '正常'
-            # ** gpt request **
+        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+        from request_llm.bridge_all import model_info
+        enc = model_info["gpt-3.5-turbo"]['tokenizer']
+        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
+        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+            txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
+        page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+            txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+        # For better results, strip everything after the Introduction (if present)
+        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+
+        ############################## <Step 1: extract high-value information from the abstract into history> ##################################
+        final_results = []
+        final_results.append(paper_meta)
+
+        ############################## <Step 2: iterate over the whole paper, distilling the content> ##################################
+        i_say_show_user = f'首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # user-facing prompt
+        chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # update the UI
+
+        iteration_results = []
+        last_iteration_result = paper_meta  # the initial value is the abstract
+        MAX_WORD_TOTAL = 4096 * 0.7
+        n_fragment = len(paper_fragments)
+        if n_fragment >= 20: print('文章极长,不能达到预期效果')
+        for i in range(n_fragment):
+            NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
+            i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
+            i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
+            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = the actual question sent to the LLM, i_say_show_user = the question shown to the user
+                                                                                llm_kwargs, chatbot, 
+                                                                                history=["The main idea of the previous section is?", last_iteration_result], # iterate on the previous result
+                                                                                sys_prompt="Extract the main idea of this section with Chinese."  # system prompt
+                                                                                ) 
+            iteration_results.append(gpt_say)
+            last_iteration_result = gpt_say
+
+        ############################## <Step 3: organize the history and extract a summary> ##################################
+        final_results.extend(iteration_results)
+        final_results.append(f'Please conclude this paper discussed above。')
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        NUM_OF_WORD = 1000
+        i_say = """
+1. Mark the title of the paper (with Chinese translation)
+2. list all the authors' names (use English)
+3. mark the first author's affiliation (output Chinese translation only)
+4. mark the keywords of this article (use English)
+5. link to the paper, Github code link (if available, fill in Github:None if not)
+6. summarize according to the following four points.Be sure to use Chinese answers (proper nouns need to be marked in English)
+    - (1):What is the research background of this article?
+    - (2):What are the past methods? What are the problems with them? Is the approach well motivated?
+    - (3):What is the research methodology proposed in this paper?
+    - (4):On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
+Follow the format of the output that follows:                  
+1. Title: xxx\n\n
+2. Authors: xxx\n\n
+3. Affiliation: xxx\n\n
+4. Keywords: xxx\n\n
+5. Urls: xxx or xxx , xxx \n\n
+6. Summary: \n\n
+    - (1):xxx;\n 
+    - (2):xxx;\n 
+    - (3):xxx;\n
+    - (4):xxx.\n\n
+Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible,
+do not have too much repetitive information, numerical values using the original numbers.
+        """
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        file_write_buffer.extend(final_results)
+        i_say, final_results = input_clipping(i_say, final_results, max_token_limit=2000)
         gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
-                inputs=i_say, 
-                inputs_show_user=i_say_show_user, 
-                llm_kwargs=llm_kwargs,
-                chatbot=chatbot, 
-                history=[],
-                sys_prompt="总结文章。"
-            )  # with a timeout countdown
-            chatbot[-1] = (i_say_show_user, gpt_say)
-            history.append(i_say_show_user); history.append(gpt_say)
-            yield from update_ui(chatbot=chatbot, history=history, msg=msg) # refresh the UI
-            if not fast_debug: time.sleep(2)
-
-    all_file = ', '.join([os.path.relpath(fp, project_folder) for index, fp in enumerate(file_manifest)])
-    i_say = f'根据以上你自己的分析,对全文进行概括,用学术性语言写一段中文摘要,然后再写一段英文摘要(包括{all_file})。'
-    chatbot.append((i_say, "[Local Message] waiting gpt response."))
-    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
-
-    if not fast_debug: 
-        msg = '正常'
-        # ** gpt request **
-        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
-            inputs=i_say, 
-            inputs_show_user=i_say, 
-            llm_kwargs=llm_kwargs,
-            chatbot=chatbot, 
-            history=history,
-            sys_prompt="总结文章。"
-        )  # with a timeout countdown
-        chatbot[-1] = (i_say, gpt_say)
-        history.append(i_say); history.append(gpt_say)
-        yield from update_ui(chatbot=chatbot, history=history, msg=msg) # refresh the UI
-        res = write_results_to_file(history)
-        chatbot.append(("完成了吗?", res))
-        yield from update_ui(chatbot=chatbot, history=history, msg=msg) # refresh the UI
+            inputs=i_say, inputs_show_user='开始最终总结', 
+            llm_kwargs=llm_kwargs, chatbot=chatbot, history=final_results, 
+            sys_prompt= f"Extract the main idea of this paper with less than {NUM_OF_WORD} Chinese characters"
+        )
+        final_results.append(gpt_say)
+        file_write_buffer.extend([i_say, gpt_say])
+        ############################## <Step 4: enforce a token cap> ##################################
+        _, final_results = input_clipping("", final_results, max_token_limit=3200)
+        yield from update_ui(chatbot=chatbot, history=final_results) # note that the history is replaced here
+
+    res = write_results_to_file(file_write_buffer, file_name=gen_time_str())
+    promote_file_to_downloadzone(res.split('\t')[-1], chatbot=chatbot)
+    yield from update_ui(chatbot=chatbot, history=final_results) # refresh the UI
 
 @CatchException
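Step 0 above hands the actual work to read_and_clean_pdf_text and breakdown_txt_to_satisfy_token_limit_for_pdf from crazy_utils. As a mental model only, not the project's implementation, a token-limited splitter of this kind can be sketched as follows; count_tokens and split_to_fit are hypothetical names, and the four-characters-per-token estimate stands in for the real tokenizer behind get_token_num:

def count_tokens(txt):
    # Hypothetical stand-in for the tokenizer; assume ~4 characters per token.
    return max(1, len(txt) // 4)

def split_to_fit(txt, limit, seps=("\n\n", "\n", ". ", " ")):
    # Recursively split txt so every fragment stays under the token limit,
    # preferring paragraph breaks, then lines, sentences, and single words.
    if count_tokens(txt) <= limit:
        return [txt]
    for sep in seps:
        parts = txt.split(sep)
        if len(parts) > 1:
            fragments, buf = [], ""
            for part in parts:
                candidate = (buf + sep + part) if buf else part
                if count_tokens(candidate) <= limit:
                    buf = candidate        # greedily grow the current fragment
                else:
                    if buf:
                        fragments.append(buf)
                    buf = part
            if buf:
                fragments.append(buf)
            # A single part may still be too long, so recurse on each fragment.
            return [piece for frag in fragments for piece in split_to_fit(frag, limit, seps)]
    # No separator helped; fall back to a hard cut in the middle.
    mid = len(txt) // 2
    return split_to_fit(txt[:mid], limit, seps) + split_to_fit(txt[mid:], limit, seps)

Per the Step 0 comment, the real pipeline additionally prefers chunks that are complete sections (introduction, experiments, and so on), splitting harder only when a section alone exceeds the limit.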
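Step 2 is a refine-style pass: each fragment is summarized with the previous fragment's summary injected as two turns of fake dialogue history, so context accumulates while every request stays inside its own token budget. Schematically, with ask_llm and summarize_paper as hypothetical stand-ins for request_gpt_model_in_new_thread_with_ui_alive and the loop above:

def summarize_paper(paper_fragments, paper_meta, ask_llm, max_word_total=int(4096 * 0.7)):
    # Refine loop: fragment i is summarized with fragment i-1's summary as history.
    iteration_results = []
    last = paper_meta                                        # seeded with the first-page metadata
    budget = max_word_total // max(1, len(paper_fragments))  # spread the word budget evenly
    for fragment in paper_fragments:
        prompt = (f"Read this section, recapitulate the content of this section "
                  f"with less than {budget} Chinese characters: {fragment}")
        summary = ask_llm(prompt,
                          history=["The main idea of the previous section is?", last])
        iteration_results.append(summary)
        last = summary                                       # carried forward into the next call
    return iteration_results

This also explains the warning printed for papers with 20 or more fragments: the fixed MAX_WORD_TOTAL is divided across all fragments, so very long papers leave too few characters per recapitulation.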
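Two different token caps appear in the new code: input_clipping(i_say, final_results, max_token_limit=2000) before the final request, and input_clipping("", final_results, max_token_limit=3200) in Step 4 before the history is handed back to the UI. A minimal version of the idea, a sketch rather than the crazy_utils implementation (clip_history is a hypothetical name), drops the oldest entries until everything fits:

def clip_history(inputs, history, count_tokens, max_token_limit):
    # Keep the newest history entries; drop from the front until the
    # prompt plus history fit inside max_token_limit.
    budget = max_token_limit - count_tokens(inputs)
    clipped = list(history)
    while clipped and sum(count_tokens(h) for h in clipped) > budget:
        clipped.pop(0)
    return inputs, clipped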
@@ -151,10 +137,7 @@ def 批量总结PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
         return
 
     # Search for the files that need processing
-    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] # + \
-                    # [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \
-                    # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
-                    # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]
     
     # If no files were found
     if len(file_manifest) == 0:
@@ -214,7 +214,7 @@ def write_results_to_file(history, file_name=None):
                 # remove everything that cannot be handled by utf8
                 f.write(content.encode('utf-8', 'ignore').decode())
             f.write('\n\n')
-    res = '以上材料已经被写入' + os.path.abspath(f'./gpt_log/{file_name}')
+    res = '以上材料已经被写入:\t' + os.path.abspath(f'./gpt_log/{file_name}')
     print(res)
     return res
 
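The change above looks cosmetic, but the inserted ':\t' is load-bearing: 解析PDF in the first hunk recovers the written file's path with res.split('\t')[-1] before handing it to promote_file_to_downloadzone, and the tab is what makes that split well-defined. For example (the path here is hypothetical):

res = '以上材料已经被写入:\t' + '/home/user/gpt_log/2023-05-20-12-00-00.md'
assert res.split('\t')[-1] == '/home/user/gpt_log/2023-05-20-12-00-00.md'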
@@ -467,8 +467,11 @@ def promote_file_to_downloadzone(file, rename_file=None, chatbot=None):
     import shutil
     if rename_file is None: rename_file = f'{gen_time_str()}-{os.path.basename(file)}'
     new_path = os.path.join(f'./gpt_log/', rename_file)
+    # If the target already exists, remove it first
     if os.path.exists(new_path) and not os.path.samefile(new_path, file): os.remove(new_path)
+    # Copy the file over
     if not os.path.exists(new_path): shutil.copyfile(file, new_path)
+    # Record the file in the chatbot cookies, to avoid interference between users
     if chatbot:
         if 'file_to_promote' in chatbot._cookies: current = chatbot._cookies['file_to_promote']
         else: current = []
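The new comments document why promote_file_to_downloadzone goes through chatbot._cookies: every session keeps its own file_to_promote list, so files promoted in one user's session never appear in another user's download zone. The excerpt ends just before the list is written back; a sketch of the bookkeeping the comments describe (SessionChatbot and promote_sketch are hypothetical names, and the exact update in the real code may differ):

class SessionChatbot(list):
    # Minimal stand-in: a chat transcript that also carries per-session
    # cookies, mirroring the chatbot._cookies accesses above.
    def __init__(self):
        super().__init__()
        self._cookies = {}

def promote_sketch(new_path, chatbot):
    # Read the session's current promoted files, then store the updated
    # list back into that session's own cookies (newest first).
    current = chatbot._cookies.get('file_to_promote', [])
    chatbot._cookies['file_to_promote'] = [new_path] + current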