From 8ac9b454e34b54019c3b2c65687fc21e4a192dd1 Mon Sep 17 00:00:00 2001 From: qingxu fu <505030475@qq.com> Date: Thu, 13 Apr 2023 11:08:53 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=B9=E5=96=84chatpdf=E7=9A=84=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functional.py | 7 - crazy_functions/crazy_utils.py | 168 +++++++++++++++++++++ crazy_functions/批量翻译PDF文档_多线程.py | 167 +------------------- crazy_functions/理解PDF文档内容.py | 176 ++++++---------------- version | 4 +- 5 files changed, 221 insertions(+), 301 deletions(-) diff --git a/crazy_functional.py b/crazy_functional.py index 2dcbf93..fbd238f 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -76,7 +76,6 @@ def get_crazy_functions(): from crazy_functions.总结word文档 import 总结word文档 from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档 from crazy_functions.谷歌检索小助手 import 谷歌检索小助手 - from crazy_functions.理解PDF文档内容 import 理解PDF文档内容 from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入 from crazy_functions.Latex全文润色 import Latex中文润色 from crazy_functions.Latex全文翻译 import Latex中译英 @@ -108,11 +107,6 @@ def get_crazy_functions(): "Color": "stop", "Function": HotReload(总结word文档) }, - # "[测试功能] 理解PDF文档内容(Tk文件选择接口,仅本地)": { - # # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效 - # "AsButton": False, # 加入下拉菜单中 - # "Function": HotReload(理解PDF文档内容) - # }, "[测试功能] 理解PDF文档内容(通用接口,读取文件输入区)": { # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效 "Color": "stop", @@ -131,7 +125,6 @@ def get_crazy_functions(): "AsButton": False, # 加入下拉菜单中 "Function": HotReload(Latex中文润色) }, - "[测试功能] Latex项目全文中译英(输入路径或上传压缩包)": { # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效 "Color": "stop", diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index 9e96e58..253e167 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -360,3 +360,171 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit): # 这个中文的句号是故意的,作为一个标识而存在 res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) return [r.replace('。\n', '.') for r in res] + + + +def read_and_clean_pdf_text(fp): + """ + 这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好 + + **输入参数说明** + - `fp`:需要读取和清理文本的pdf文件路径 + + **输出参数说明** + - `meta_txt`:清理后的文本内容字符串 + - `page_one_meta`:第一页清理后的文本内容列表 + + **函数功能** + 读取pdf文件并清理其中的文本内容,清理规则包括: + - 提取所有块元的文本信息,并合并为一个字符串 + - 去除短块(字符数小于100)并替换为回车符 + - 清理多余的空行 + - 合并小写字母开头的段落块并替换为空格 + - 清除重复的换行 + - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔 + """ + import fitz, copy + import re + import numpy as np + from colorful import print亮黄, print亮绿 + fc = 0 + fs = 1 + fb = 2 + REMOVE_FOOT_NOTE = True + REMOVE_FOOT_FFSIZE_PERCENT = 0.95 + def primary_ffsize(l): + fsize_statiscs = {} + for wtf in l['spans']: + if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0 + fsize_statiscs[wtf['size']] += len(wtf['text']) + return max(fsize_statiscs, key=fsize_statiscs.get) + + def ffsize_same(a,b): + return abs((a-b)/max(a,b)) < 0.02 + # file_content = "" + with fitz.open(fp) as doc: + meta_txt = [] + meta_font = [] + + meta_line = [] + meta_span = [] + for index, page in enumerate(doc): + # file_content += page.get_text() + text_areas = page.get_text("dict") # 获取页面上的文本信息 + for t in text_areas['blocks']: + if 'lines' in t: + pf = 998 + for l in t['lines']: + txt_line = "".join([wtf['text'] for wtf in l['spans']]) + pf = primary_ffsize(l) + meta_line.append([txt_line, pf, l['bbox'], l]) + for wtf in l['spans']: # for l in t['lines']: + meta_span.append([wtf['text'], wtf['size'], 
len(wtf['text'])]) + # meta_line.append(["NEW_BLOCK", pf]) + # 块元提取 for each word segment with in line for each line cross-line words for each block + meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( + '- ', '') for t in text_areas['blocks'] if 'lines' in t]) + meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']]) + for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t]) + if index == 0: + page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( + '- ', '') for t in text_areas['blocks'] if 'lines' in t] + # 获取正文主字体 + fsize_statiscs = {} + for span in meta_span: + if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0 + fsize_statiscs[span[1]] += span[2] + main_fsize = max(fsize_statiscs, key=fsize_statiscs.get) + if REMOVE_FOOT_NOTE: + give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT + + # 切分和重新整合 + mega_sec = [] + sec = [] + for index, line in enumerate(meta_line): + if index == 0: + sec.append(line[fc]) + continue + if REMOVE_FOOT_NOTE: + if meta_line[index][fs] <= give_up_fize_threshold: + continue + if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]): + # 尝试识别段落 + if meta_line[index][fc].endswith('.') and\ + (meta_line[index-1][fc] != 'NEW_BLOCK') and \ + (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7: + sec[-1] += line[fc] + sec[-1] += "\n\n" + else: + sec[-1] += " " + sec[-1] += line[fc] + else: + if (index+1 < len(meta_line)) and \ + meta_line[index][fs] > main_fsize: + # 单行 + 字体大 + mega_sec.append(copy.deepcopy(sec)) + sec = [] + sec.append("# " + line[fc]) + else: + # 尝试识别section + if meta_line[index-1][fs] > meta_line[index][fs]: + sec.append("\n" + line[fc]) + else: + sec.append(line[fc]) + mega_sec.append(copy.deepcopy(sec)) + + finals = [] + for ms in mega_sec: + final = " ".join(ms) + final = final.replace('- ', ' ') + finals.append(final) + meta_txt = finals + + def 把字符太少的块清除为回车(meta_txt): + for index, block_txt in enumerate(meta_txt): + if len(block_txt) < 100: + meta_txt[index] = '\n' + return meta_txt + meta_txt = 把字符太少的块清除为回车(meta_txt) + + def 清理多余的空行(meta_txt): + for index in reversed(range(1, len(meta_txt))): + if meta_txt[index] == '\n' and meta_txt[index-1] == '\n': + meta_txt.pop(index) + return meta_txt + meta_txt = 清理多余的空行(meta_txt) + + def 合并小写开头的段落块(meta_txt): + def starts_with_lowercase_word(s): + pattern = r"^[a-z]+" + match = re.match(pattern, s) + if match: + return True + else: + return False + for _ in range(100): + for index, block_txt in enumerate(meta_txt): + if starts_with_lowercase_word(block_txt): + if meta_txt[index-1] != '\n': + meta_txt[index-1] += ' ' + else: + meta_txt[index-1] = '' + meta_txt[index-1] += meta_txt[index] + meta_txt[index] = '\n' + return meta_txt + meta_txt = 合并小写开头的段落块(meta_txt) + meta_txt = 清理多余的空行(meta_txt) + + meta_txt = '\n'.join(meta_txt) + # 清除重复的换行 + for _ in range(5): + meta_txt = meta_txt.replace('\n\n', '\n') + + # 换行 -> 双换行 + meta_txt = meta_txt.replace('\n', '\n\n') + + for f in finals: + print亮黄(f) + print亮绿('***************************') + + return meta_txt, page_one_meta diff --git a/crazy_functions/批量翻译PDF文档_多线程.py b/crazy_functions/批量翻译PDF文档_多线程.py index 244a4e1..0aa0b3c 100644 --- a/crazy_functions/批量翻译PDF文档_多线程.py +++ b/crazy_functions/批量翻译PDF文档_多线程.py @@ -2,174 +2,9 @@ from toolbox import CatchException, report_execption, write_results_to_file from toolbox import update_ui from 
.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency +from .crazy_utils import read_and_clean_pdf_text from colorful import * -def read_and_clean_pdf_text(fp): - """ - 这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好,不建议任何人去读这个函数 - - **输入参数说明** - - `fp`:需要读取和清理文本的pdf文件路径 - - **输出参数说明** - - `meta_txt`:清理后的文本内容字符串 - - `page_one_meta`:第一页清理后的文本内容列表 - - **函数功能** - 读取pdf文件并清理其中的文本内容,清理规则包括: - - 提取所有块元的文本信息,并合并为一个字符串 - - 去除短块(字符数小于100)并替换为回车符 - - 清理多余的空行 - - 合并小写字母开头的段落块并替换为空格 - - 清除重复的换行 - - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔 - """ - import fitz, copy - import re - import numpy as np - fc = 0 - fs = 1 - fb = 2 - REMOVE_FOOT_NOTE = True - REMOVE_FOOT_FFSIZE_PERCENT = 0.95 - def primary_ffsize(l): - fsize_statiscs = {} - for wtf in l['spans']: - if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0 - fsize_statiscs[wtf['size']] += len(wtf['text']) - return max(fsize_statiscs, key=fsize_statiscs.get) - - def ffsize_same(a,b): - return abs((a-b)/max(a,b)) < 0.02 - # file_content = "" - with fitz.open(fp) as doc: - meta_txt = [] - meta_font = [] - - meta_line = [] - meta_span = [] - for index, page in enumerate(doc): - # file_content += page.get_text() - text_areas = page.get_text("dict") # 获取页面上的文本信息 - for t in text_areas['blocks']: - if 'lines' in t: - pf = 998 - for l in t['lines']: - txt_line = "".join([wtf['text'] for wtf in l['spans']]) - pf = primary_ffsize(l) - meta_line.append([txt_line, pf, l['bbox'], l]) - for wtf in l['spans']: # for l in t['lines']: - meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])]) - # meta_line.append(["NEW_BLOCK", pf]) - # 块元提取 for each word segment with in line for each line cross-line words for each block - meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( - '- ', '') for t in text_areas['blocks'] if 'lines' in t]) - meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']]) - for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t]) - if index == 0: - page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( - '- ', '') for t in text_areas['blocks'] if 'lines' in t] - # 获取正文主字体 - fsize_statiscs = {} - for span in meta_span: - if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0 - fsize_statiscs[span[1]] += span[2] - main_fsize = max(fsize_statiscs, key=fsize_statiscs.get) - if REMOVE_FOOT_NOTE: - give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT - - # 切分和重新整合 - mega_sec = [] - sec = [] - for index, line in enumerate(meta_line): - if index == 0: - sec.append(line[fc]) - continue - if REMOVE_FOOT_NOTE: - if meta_line[index][fs] <= give_up_fize_threshold: - continue - if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]): - # 尝试识别段落 - if meta_line[index][fc].endswith('.') and\ - (meta_line[index-1][fc] != 'NEW_BLOCK') and \ - (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7: - sec[-1] += line[fc] - sec[-1] += "\n\n" - else: - sec[-1] += " " - sec[-1] += line[fc] - else: - if (index+1 < len(meta_line)) and \ - meta_line[index][fs] > main_fsize: - # 单行 + 字体大 - mega_sec.append(copy.deepcopy(sec)) - sec = [] - sec.append("# " + line[fc]) - else: - # 尝试识别section - if meta_line[index-1][fs] > meta_line[index][fs]: - sec.append("\n" + line[fc]) - else: - sec.append(line[fc]) - mega_sec.append(copy.deepcopy(sec)) - - finals = [] - for ms in 
mega_sec: - final = " ".join(ms) - final = final.replace('- ', ' ') - finals.append(final) - meta_txt = finals - - def 把字符太少的块清除为回车(meta_txt): - for index, block_txt in enumerate(meta_txt): - if len(block_txt) < 100: - meta_txt[index] = '\n' - return meta_txt - meta_txt = 把字符太少的块清除为回车(meta_txt) - - def 清理多余的空行(meta_txt): - for index in reversed(range(1, len(meta_txt))): - if meta_txt[index] == '\n' and meta_txt[index-1] == '\n': - meta_txt.pop(index) - return meta_txt - meta_txt = 清理多余的空行(meta_txt) - - def 合并小写开头的段落块(meta_txt): - def starts_with_lowercase_word(s): - pattern = r"^[a-z]+" - match = re.match(pattern, s) - if match: - return True - else: - return False - for _ in range(100): - for index, block_txt in enumerate(meta_txt): - if starts_with_lowercase_word(block_txt): - if meta_txt[index-1] != '\n': - meta_txt[index-1] += ' ' - else: - meta_txt[index-1] = '' - meta_txt[index-1] += meta_txt[index] - meta_txt[index] = '\n' - return meta_txt - meta_txt = 合并小写开头的段落块(meta_txt) - meta_txt = 清理多余的空行(meta_txt) - - meta_txt = '\n'.join(meta_txt) - # 清除重复的换行 - for _ in range(5): - meta_txt = meta_txt.replace('\n\n', '\n') - - # 换行 -> 双换行 - meta_txt = meta_txt.replace('\n', '\n\n') - - for f in finals: - print亮黄(f) - print亮绿('***************************') - - return meta_txt, page_one_meta - - @CatchException def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port): import glob diff --git a/crazy_functions/理解PDF文档内容.py b/crazy_functions/理解PDF文档内容.py index a9278e8..fada6b0 100644 --- a/crazy_functions/理解PDF文档内容.py +++ b/crazy_functions/理解PDF文档内容.py @@ -1,142 +1,66 @@ from toolbox import update_ui from toolbox import CatchException, report_execption -import re -import unicodedata +from .crazy_utils import read_and_clean_pdf_text from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive fast_debug = False -def is_paragraph_break(match): - """ - 根据给定的匹配结果来判断换行符是否表示段落分隔。 - 如果换行符前为句子结束标志(句号,感叹号,问号),且下一个字符为大写字母,则换行符更有可能表示段落分隔。 - 也可以根据之前的内容长度来判断段落是否已经足够长。 - """ - prev_char, next_char = match.groups() - - # 句子结束标志 - sentence_endings = ".!?" - - # 设定一个最小段落长度阈值 - min_paragraph_length = 140 - - if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length: - return "\n\n" - else: - return " " - -def normalize_text(text): - """ - 通过把连字(ligatures)等文本特殊符号转换为其基本形式来对文本进行归一化处理。 - 例如,将连字 "fi" 转换为 "f" 和 "i"。 - """ - # 对文本进行归一化处理,分解连字 - normalized_text = unicodedata.normalize("NFKD", text) - - # 替换其他特殊字符 - cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text) - - return cleaned_text - -def clean_text(raw_text): - """ - 对从 PDF 提取出的原始文本进行清洗和格式化处理。 - 1. 对原始文本进行归一化处理。 - 2. 替换跨行的连词,例如 “Espe-\ncially” 转换为 “Especially”。 - 3. 
根据 heuristic 规则判断换行符是否是段落分隔,并相应地进行替换。 - """ - # 对文本进行归一化处理 - normalized_text = normalize_text(raw_text) - - # 替换跨行的连词 - text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text) - - # 根据前后相邻字符的特点,找到原文本中的换行符 - newlines = re.compile(r'(\S)\n(\S)') - - # 根据 heuristic 规则,用空格或段落分隔符替换原换行符 - final_text = re.sub(newlines, lambda m: m.group(1) + is_paragraph_break(m) + m.group(2), text) - - return final_text.strip() def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): - import time, glob, os, fitz + import tiktoken print('begin analysis on:', file_name) + file_content, page_one = read_and_clean_pdf_text(file_name) - with fitz.open(file_name) as doc: - file_content = "" - for page in doc: - file_content += page.get_text() - file_content = clean_text(file_content) - # print(file_content) - split_number = 10000 - split_group = (len(file_content)//split_number)+1 - for i in range(0,split_group): - if i==0: - prefix = "接下来请你仔细分析下面的论文,学习里面的内容(专业术语、公式、数学概念).并且注意:由于论文内容较多,将分批次发送,每次发送完之后,你只需要回答“接受完成”" - i_say = prefix + f'文件名是{file_name},文章内容第{i+1}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```' - i_say_show_user = f'文件名是:\n{file_name},\n由于论文内容过长,将分批请求(共{len(file_content)}字符,将分为{split_group}批,每批{split_number}字符)。\n当前发送{i+1}/{split_group}部分' - elif i==split_group-1: - i_say = f'你只需要回答“所有论文接受完成,请进行下一步”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:]}```' - i_say_show_user = f'当前发送{i+1}/{split_group}部分' - else: - i_say = f'你只需要回答“接受完成”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```' - i_say_show_user = f'当前发送{i+1}/{split_group}部分' - chatbot.append((i_say_show_user, "[Local Message] waiting gpt response.")) - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, llm_kwargs, chatbot, history=[], sys_prompt="") # 带超时倒计时 - while "完成" not in gpt_say: - i_say = f'你只需要回答“接受完成”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```' - i_say_show_user = f'出现error,重新发送{i+1}/{split_group}部分' - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, llm_kwargs, chatbot, history=[], sys_prompt="") # 带超时倒计时 - time.sleep(1) - chatbot[-1] = (i_say_show_user, gpt_say) - history.append(i_say_show_user); history.append(gpt_say) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - time.sleep(2) + # 递归地切割PDF文件,每一块(尽量是完整的一个section,比如introduction,experiment等,必要时再进行切割) + # 的长度必须小于 2500 个 Token + TOKEN_LIMIT_PER_FRAGMENT = 2500 - i_say = f'接下来,请你扮演一名专业的学术教授,利用你的所有知识并且结合这篇文章,回答我的问题。(请牢记:1.直到我说“退出”,你才能结束任务;2.所有问题需要紧密围绕文章内容;3.如果有公式,请使用tex渲染)' - chatbot.append((i_say, "[Local Message] waiting gpt response.")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + from toolbox import get_conf + enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL')) + def get_token_num(txt): return len(enc.encode(txt)) + paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( + txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) + page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( + txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4) + # 为了更好的效果,我们剥离Introduction之后的部分(如果有) + paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] + + ############################## <第一步,从摘要中提取高价值信息,放到history中> 
################################## + final_results = [] + final_results.append(paper_meta) - # ** gpt request ** - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say, llm_kwargs, chatbot, history=history, sys_prompt="") # 带超时倒计时 - chatbot[-1] = (i_say, gpt_say) - history.append(i_say); history.append(gpt_say) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + ############################## <第二步,迭代地历遍整个文章,提取精炼信息> ################################## + i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。" # 用户提示 + chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[]) # 更新UI + iteration_results = [] + last_iteration_result = paper_meta # 初始值是摘要 + MAX_WORD_TOTAL = 4096 + n_fragment = len(paper_fragments) + if n_fragment >= 20: print('文章极长,不能达到预期效果') + for i in range(n_fragment): + NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment + i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}" + i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}" + gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, # i_say=真正给chatgpt的提问, i_say_show_user=给用户看的提问 + llm_kwargs, chatbot, + history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果 + sys_prompt="Extract the main idea of this section." # 提示 + ) + iteration_results.append(gpt_say) + last_iteration_result = gpt_say -@CatchException -def 理解PDF文档内容(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - import glob, os - - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe。"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - import tkinter as tk - from tkinter import filedialog - - root = tk.Tk() - root.withdraw() - txt = filedialog.askopenfilename() - - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - import fitz - except: - report_execption(chatbot, history, - a = f"解析项目: {txt}", - b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # 清空历史,以免输入溢出 - history = [] - - # 开始正式执行任务 - yield from 解析PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + ############################## <第三步,整理history> ################################## + final_results.extend(iteration_results) + final_results.append(f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。') + # 接下来两句话只显示在界面上,不起实际作用 + i_say_show_user = f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。" + chatbot.append([i_say_show_user, gpt_say]) + ############################## <第四步,设置一个token上限,防止回答时Token溢出> ################################## + from .crazy_utils import input_clipping + _, final_results = input_clipping("", final_results, max_token_limit=3200) + yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了 @CatchException @@ -146,7 +70,7 @@ def 理解PDF文档内容标准文件输入(txt, llm_kwargs, plugin_kwargs, chat # 基本信息:功能、贡献者 chatbot.append([ "函数插件功能?", - "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe。"]) + "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe, binary-husky"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 尝试导入依赖,如果缺少依赖,则给出安装建议 diff --git a/version b/version index d1ed976..393195c 100644 --- a/version +++ b/version @@ -1,5 +1,5 @@ { - "version": 2.67, + "version": 
2.68, "show_feature": true, - "new_feature": "现可通过输入区更新临时api-key <-> 增强多线程稳定性(涉及代码解析、PDF翻译、自译解等) <-> 修复Token计数错误(解决PDF翻译的分割不合理的问题) <-> 如果一键更新失败,可前往github手动更新" + "new_feature": "改善理解pdf(chatpdf)功能 <-> 如果一键更新失败,可前往github手动更新" }
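
Note on the new helper: the `read_and_clean_pdf_text` function added to `crazy_utils.py` drives most of its cleanup off font-size statistics — the span size that covers the most characters is taken as the body font, and (with `REMOVE_FOOT_NOTE` enabled) lines whose primary size falls below 95% of it are discarded as footnotes or captions. Below is a minimal standalone sketch of that heuristic only; it assumes PyMuPDF is installed, uses a placeholder file path, and omits the block merging, section detection, and paragraph repair the real helper also performs.

```python
# Minimal sketch of the font-size heuristic behind read_and_clean_pdf_text.
# Assumptions: PyMuPDF is installed (pip install pymupdf); "paper.pdf" is a placeholder path.
import fitz  # PyMuPDF

REMOVE_FOOT_FFSIZE_PERCENT = 0.95  # same threshold as the patch

def primary_ffsize(line):
    """Return the font size that covers the most characters in one line."""
    stats = {}
    for span in line['spans']:
        stats[span['size']] = stats.get(span['size'], 0) + len(span['text'])
    return max(stats, key=stats.get)

def extract_body_lines(fp):
    """Keep only lines whose primary font size is close to the document's main body size."""
    raw_lines = []   # (text, primary font size) for every text line
    size_stats = {}  # characters seen per font size, across the whole file
    with fitz.open(fp) as doc:
        for page in doc:
            for block in page.get_text("dict")['blocks']:
                for line in block.get('lines', []):   # image blocks have no 'lines'
                    text = "".join(span['text'] for span in line['spans'])
                    raw_lines.append((text, primary_ffsize(line)))
                    for span in line['spans']:
                        size_stats[span['size']] = size_stats.get(span['size'], 0) + len(span['text'])
    if not size_stats:
        return []
    main_fsize = max(size_stats, key=size_stats.get)           # dominant "body" font size
    threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
    return [text for text, size in raw_lines if size > threshold]  # drop footnotes/captions

if __name__ == "__main__":
    for line in extract_body_lines("paper.pdf")[:20]:
        print(line)
```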
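
Note on the rewritten `解析PDF`: it no longer slices the paper into fixed 10000-character chunks; it splits by a per-fragment token budget (`TOKEN_LIMIT_PER_FRAGMENT = 2500`) and then walks the fragments one by one, asking for a bounded-length recap of each while passing the previous recap back in as history. The sketch below is a hedged approximation with assumed stand-ins: `split_by_token_limit` approximates `breakdown_txt_to_satisfy_token_limit_for_pdf`, `cl100k_base` is assumed for the tiktoken encoding (the plugin derives it from `LLM_MODEL`), and `ask_llm` is a caller-supplied stand-in for `request_gpt_model_in_new_thread_with_ui_alive`.

```python
# Hedged sketch of the fragment-and-recap loop in 解析PDF.
# Assumptions: tiktoken is installed; "cl100k_base" stands in for the encoding
# the plugin derives from LLM_MODEL; ask_llm is any callable that sends one
# prompt (plus history) to a chat model and returns its reply.
import tiktoken

TOKEN_LIMIT_PER_FRAGMENT = 2500   # same per-fragment budget as the patch
MAX_WORD_TOTAL = 4096             # total recap length, split across fragments

def split_by_token_limit(txt, count_tokens, limit):
    """Greedy paragraph-level splitter; the real helper
    (breakdown_txt_to_satisfy_token_limit_for_pdf) also falls back to
    sentence- and character-level cuts when one paragraph exceeds the limit."""
    fragments, current = [], ""
    for para in txt.split("\n\n"):
        candidate = (current + "\n\n" + para).strip()
        if current and count_tokens(candidate) > limit:
            fragments.append(current)
            current = para
        else:
            current = candidate
    if current:
        fragments.append(current)
    return fragments

def iterative_recap(paper_text, paper_meta, ask_llm):
    enc = tiktoken.get_encoding("cl100k_base")
    count_tokens = lambda s: len(enc.encode(s))
    fragments = split_by_token_limit(paper_text, count_tokens, TOKEN_LIMIT_PER_FRAGMENT)
    n_fragment = len(fragments)
    num_of_word = MAX_WORD_TOTAL // max(n_fragment, 1)
    last_summary = paper_meta          # seeded with the title/abstract block
    results = [paper_meta]
    for frag in fragments:
        prompt = (f"Read this section, recapitulate the content of this section "
                  f"with less than {num_of_word} words: {frag}")
        history = ["The main idea of the previous section is?", last_summary]
        summary = ask_llm(prompt, history,
                          sys_prompt="Extract the main idea of this section.")
        results.append(summary)
        last_summary = summary         # feed the recap forward, like last_iteration_result
    return results

if __name__ == "__main__":
    # Trivial stub so the sketch runs without an API key.
    echo_llm = lambda prompt, history, sys_prompt: prompt[:80]
    print(iterative_recap("Intro...\n\nMethod...\n\nResults...", "A toy paper.", echo_llm))
```

Carrying `last_summary` forward mirrors the patch's `last_iteration_result`: each request stays roughly constant-sized regardless of paper length, which is why the patch only warns (rather than fails) when a paper splits into 20 or more fragments.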