chatgpt_academic/crazy_functions/理解PDF文档内容.py

115 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from toolbox import update_ui
from toolbox import CatchException, report_exception
from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
fast_debug = False
def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
import tiktoken
print('begin analysis on:', file_name)
############################## <第 0 步切割PDF> ##################################
# 递归地切割PDF文件每一块尽量是完整的一个section比如introductionexperiment等必要时再进行切割
# 的长度必须小于 2500 个 Token
file_content, page_one = read_and_clean_pdf_text(file_name) # 尝试按照章节切割PDF
file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
TOKEN_LIMIT_PER_FRAGMENT = 2500
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
from request_llms.bridge_all import model_info
enc = model_info["gpt-3.5-turbo"]['tokenizer']
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
# 为了更好的效果我们剥离Introduction之后的部分如果有
paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
############################## <第 1 步从摘要中提取高价值信息放到history中> ##################################
final_results = []
final_results.append(paper_meta)
############################## <第 2 步,迭代地历遍整个文章,提取精炼信息> ##################################
i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。" # 用户提示
chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[]) # 更新UI
iteration_results = []
last_iteration_result = paper_meta # 初始值是摘要
MAX_WORD_TOTAL = 4096
n_fragment = len(paper_fragments)
if n_fragment >= 20: print('文章极长,不能达到预期效果')
for i in range(n_fragment):
NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, # i_say=真正给chatgpt的提问 i_say_show_user=给用户看的提问
llm_kwargs, chatbot,
history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果
sys_prompt="Extract the main idea of this section, answer me with Chinese." # 提示
)
iteration_results.append(gpt_say)
last_iteration_result = gpt_say
############################## <第 3 步整理history> ##################################
final_results.extend(iteration_results)
final_results.append(f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。')
# 接下来两句话只显示在界面上,不起实际作用
i_say_show_user = f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
chatbot.append([i_say_show_user, gpt_say])
############################## <第 4 步设置一个token上限防止回答时Token溢出> ##################################
from .crazy_utils import input_clipping
_, final_results = input_clipping("", final_results, max_token_limit=3200)
yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了
@CatchException
def 理解PDF文档内容标准文件输入(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
import glob, os
# 基本信息:功能、贡献者
chatbot.append([
"函数插件功能?",
"理解PDF论文内容并且将结合上下文内容进行学术解答。函数插件贡献者: Hanzoe, binary-husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
import fitz
except:
report_exception(chatbot, history,
a = f"解析项目: {txt}",
b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 清空历史,以免输入溢出
history = []
# 检测输入参数,如没有给定输入参数,直接退出
if os.path.exists(txt):
project_folder = txt
else:
if txt == "":
txt = '空空如也的输入栏'
report_exception(chatbot, history,
a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 搜索需要处理的文件清单
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]
# 如果没找到任何文件
if len(file_manifest) == 0:
report_exception(chatbot, history,
a=f"解析项目: {txt}", b=f"找不到任何.tex或.pdf文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
txt = file_manifest[0]
# 开始正式执行任务
yield from 解析PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)