From 3c271302cc4b1e06dc1fa8e11ba3bcabce92b018 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Tue, 19 Dec 2023 19:30:44 +0800
Subject: [PATCH] improve long text breakdown performance

---
 crazy_functions/Latex全文润色.py            |   4 +-
 crazy_functions/Latex全文翻译.py            |   4 +-
 crazy_functions/crazy_utils.py              |  90 ---------------
 crazy_functions/ipc_fns/mp.py               |  37 ++++++
 crazy_functions/latex_fns/latex_actions.py  |  11 +-
 crazy_functions/pdf_fns/breakdown_txt.py    | 125 +++++++++++++++++++++
 crazy_functions/pdf_fns/parse_pdf.py        |   4 +-
 crazy_functions/总结word文档.py             |   8 +-
 crazy_functions/批量Markdown翻译.py         |   4 +-
 crazy_functions/批量总结PDF文档.py          |  11 +-
 crazy_functions/批量翻译PDF文档_多线程.py   |  11 +-
 crazy_functions/理解PDF文档内容.py          |  13 +--
 crazy_functions/解析JupyterNotebook.py      |  12 +-
 13 files changed, 186 insertions(+), 148 deletions(-)
 create mode 100644 crazy_functions/ipc_fns/mp.py
 create mode 100644 crazy_functions/pdf_fns/breakdown_txt.py

diff --git a/crazy_functions/Latex全文润色.py b/crazy_functions/Latex全文润色.py
index 0bc7d40..b736fe8 100644
--- a/crazy_functions/Latex全文润色.py
+++ b/crazy_functions/Latex全文润色.py
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
diff --git a/crazy_functions/Latex全文翻译.py b/crazy_functions/Latex全文翻译.py
index 846bd80..49470c8 100644
--- a/crazy_functions/Latex全文翻译.py
+++ b/crazy_functions/Latex全文翻译.py
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py
index 9778053..731da1a 100644
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -312,95 +312,6 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
     return gpt_response_collection
 
 
-def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
-    def cut(txt_tocut, must_break_at_empty_line):  # recursive
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                print(cnt)
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                raise RuntimeError("存在一行极长的文本!")
-            # print(len(post))
-            # recursively chain the resulting lists
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line))
-            return result
-    try:
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        return cut(txt, must_break_at_empty_line=False)
-
-
-def force_breakdown(txt, limit, get_token_fn):
-    """
-    When the text cannot be split on punctuation or blank lines, cut it by brute force
-    """
-    for i in reversed(range(len(txt))):
-        if get_token_fn(txt[:i]) < limit:
-            return txt[:i], txt[i:]
-    return "Tiktoken未知错误", "Tiktoken未知错误"
-
-def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
-    # recursive
-    def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            cnt = 0
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                if break_anyway:
-                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
-                else:
-                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
-            # print(len(post))
-            # recursively chain the resulting lists
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
-            return result
-    try:
-        # attempt 1: use double blank lines (\n\n) as the cut point
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        try:
-            # attempt 2: use single newlines (\n) as the cut point
-            return cut(txt, must_break_at_empty_line=False)
-        except RuntimeError:
-            try:
-                # attempt 3: use English periods (.) as the cut point
-                res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # the Chinese full stop is intentional, it only serves as a marker
-                return [r.replace('。\n', '.') for r in res]
-            except RuntimeError as e:
-                try:
-                    # attempt 4: use Chinese full stops (。) as the cut point
-                    res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False)
-                    return [r.replace('。。\n', '。') for r in res]
-                except RuntimeError as e:
-                    # attempt 5: nothing else works, just cut anywhere and make do
-                    return cut(txt, must_break_at_empty_line=False, break_anyway=True)
-
-
 def read_and_clean_pdf_text(fp):
     """
@@ -631,7 +542,6 @@ def get_files_from_everything(txt, type): # type='.md'
 
 
 
-
 @Singleton
 class nougat_interface():
     def __init__(self):
diff --git a/crazy_functions/ipc_fns/mp.py b/crazy_functions/ipc_fns/mp.py
new file mode 100644
index 0000000..575d47c
--- /dev/null
+++ b/crazy_functions/ipc_fns/mp.py
@@ -0,0 +1,37 @@
+import platform
+import pickle
+import multiprocessing
+
+def run_in_subprocess_wrapper_func(v_args):
+    func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args)
+    import sys
+    try:
+        result = func(*args, **kwargs)
+        return_dict['result'] = result
+    except Exception as e:
+        exc_info = sys.exc_info()
+        exception_dict['exception'] = exc_info
+
+def run_in_subprocess_with_timeout(func, timeout=60):
+    if platform.system() == 'Linux':
+        def wrapper(*args, **kwargs):
+            return_dict = multiprocessing.Manager().dict()
+            exception_dict = multiprocessing.Manager().dict()
+            v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict))
+            process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,))
+            process.start()
+            process.join(timeout)
+            if process.is_alive():
+                process.terminate()
+                raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务')
+            process.close()
+            if 'exception' in exception_dict:
+                # oops, the subprocess ran into an exception
+                exc_info = exception_dict['exception']
+                raise exc_info[1].with_traceback(exc_info[2])
+            if 'result' in return_dict.keys():
+                # if the subprocess finished successfully, return its result
+                return return_dict['result']
+        return wrapper
+    else:
+        return func
\ No newline at end of file
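The new helper above is generic: on Linux, run_in_subprocess_with_timeout returns a wrapper that pickles the call, executes it in a child process, re-raises any child exception in the parent, and raises TimeoutError when the deadline is exceeded; on other platforms it returns the function unchanged. A minimal usage sketch, not part of the patch (slow_tokenize is a hypothetical stand-in for any picklable, top-level function):

    from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout

    def slow_tokenize(text):
        # stands in for work that might hang on pathological input
        return text.split()

    # wrap once, then call exactly like the original function
    safe_tokenize = run_in_subprocess_with_timeout(slow_tokenize, timeout=10)

    try:
        print(len(safe_tokenize("some very long document")))
    except TimeoutError:
        print("tokenization did not finish within 10 seconds")
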
diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py
index b80c01d..6638c12 100644
--- a/crazy_functions/latex_fns/latex_actions.py
+++ b/crazy_functions/latex_fns/latex_actions.py
@@ -176,12 +176,6 @@ class LatexPaperFileGroup():
         self.sp_file_index = []
         self.sp_file_tag = []
 
-        # count_token
-        from request_llms.bridge_all import model_info
-        enc = model_info["gpt-3.5-turbo"]['tokenizer']
-        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-        self.get_token_num = get_token_num
-
     def run_file_split(self, max_token_limit=1900):
         """
         use tokenizer to break down text according to max_token_limit
@@ -192,13 +186,12 @@ class LatexPaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from ..crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
                     self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
-        print('Segmentation: done')
 
     def merge_result(self):
         self.file_result = ["" for _ in range(len(self.file_paths))]
diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py
new file mode 100644
index 0000000..1db8696
--- /dev/null
+++ b/crazy_functions/pdf_fns/breakdown_txt.py
@@ -0,0 +1,125 @@
+from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
+
+def force_breakdown(txt, limit, get_token_fn):
+    """ When the text cannot be split on punctuation or blank lines, cut it by brute force
+    """
+    for i in reversed(range(len(txt))):
+        if get_token_fn(txt[:i]) < limit:
+            return txt[:i], txt[i:]
+    return "Tiktoken未知错误", "Tiktoken未知错误"
+
+
+def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
+    """ To speed things up we use a special trick: when remain_txt_to_cut grows beyond `_max`, everything after `_max` is parked in remain_txt_to_cut_storage;
+        when remain_txt_to_cut drops below `_min`, part of remain_txt_to_cut_storage is pulled back in
+    """
+    _min = int(5e4)
+    _max = int(1e5)
+    # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage))
+    if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0:
+        remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage
+        remain_txt_to_cut_storage = ""
+    if len(remain_txt_to_cut) > _max:
+        remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage
+        remain_txt_to_cut = remain_txt_to_cut[:_max]
+    return remain_txt_to_cut, remain_txt_to_cut_storage
+
+
+def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
+    """ Split the text into pieces that satisfy the token limit
+    """
+    res = []
+    total_len = len(txt_tocut)
+    fin_len = 0
+    remain_txt_to_cut = txt_tocut
+    remain_txt_to_cut_storage = ""
+    # to speed things up, park everything beyond `_max` characters in remain_txt_to_cut_storage
+    remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+
+    while True:
+        if get_token_fn(remain_txt_to_cut) <= limit:
+            # the remaining text is within the token limit, no further cutting needed
+            res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut)
+            break
+        else:
+            # the remaining text exceeds the token limit, keep cutting
+            lines = remain_txt_to_cut.split('\n')
+
+            # estimate a cut position
+            estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
+            estimated_line_cut = int(estimated_line_cut)
+
+            # search for a suitable cut offset (cnt)
+            cnt = 0
+            for cnt in reversed(range(estimated_line_cut)):
+                if must_break_at_empty_line:
+                    # first try to break at an empty line (\n\n)
+                    if lines[cnt] != "":
+                        continue
+                prev = "\n".join(lines[:cnt])
+                post = "\n".join(lines[cnt:])
+                if get_token_fn(prev) < limit:
+                    break
+
+            if cnt == 0:
+                # no suitable cut point was found
+                if break_anyway:
+                    # brute-force cutting is allowed
+                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
+                else:
+                    # brute-force cutting is not allowed, raise an error
+                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
+
+            # append the fragment to the result list
+            res.append(prev); fin_len+=len(prev)
+            # prepare the next iteration
+            remain_txt_to_cut = post
+            remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+            process = fin_len/total_len
+            print(f'\r正在文本切分 {int(process*100)}%', end='')
+            if len(remain_txt_to_cut.strip()) == 0:
+                break
+    return res
+
+
+def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"):
+    """ Try several strategies in turn to split the text so that every fragment satisfies the token limit
+    """
+    from request_llms.bridge_all import model_info
+    enc = model_info[llm_model]['tokenizer']
+    def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
+    try:
+        # attempt 1: use double blank lines (\n\n) as the cut point
+        return cut(limit, get_token_fn, txt, must_break_at_empty_line=True)
+    except RuntimeError:
+        try:
+            # attempt 2: use single newlines (\n) as the cut point
+            return cut(limit, get_token_fn, txt, must_break_at_empty_line=False)
+        except RuntimeError:
+            try:
+                # attempt 3: use English periods (.) as the cut point
+                res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False) # the Chinese full stop is intentional, it only serves as a marker
+                return [r.replace('。\n', '.') for r in res]
+            except RuntimeError as e:
+                try:
+                    # attempt 4: use Chinese full stops (。) as the cut point
+                    res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False)
+                    return [r.replace('。。\n', '。') for r in res]
+                except RuntimeError as e:
+                    # attempt 5: nothing else works, just cut anywhere
+                    return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True)
+
+breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60)
+
+if __name__ == '__main__':
+    from crazy_functions.crazy_utils import read_and_clean_pdf_text
+    file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf")
+
+    from request_llms.bridge_all import model_info
+    for i in range(5):
+        file_content += file_content
+
+    print(len(file_content))
+    TOKEN_LIMIT_PER_FRAGMENT = 2500
+    res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)
+
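All of the call-site hunks that follow apply the same migration: the caller no longer builds a get_token_fn from a tokenizer; it passes the raw text, the token limit, and the model name, and the tokenizer lookup happens inside the new module. A rough before/after sketch, assuming the gpt_academic package is importable and using purely illustrative inputs (the sample text and limit are not taken from the patch):

    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit

    file_content = "Some paragraph of a long document.\n\n" * 3000   # illustrative stand-in text
    TOKEN_LIMIT_PER_FRAGMENT = 2500

    # old call shape (removed by this patch):
    #   breakdown_txt_to_satisfy_token_limit_for_pdf(txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
    # new call shape:
    fragments = breakdown_text_to_satisfy_token_limit(txt=file_content,
                                                      limit=TOKEN_LIMIT_PER_FRAGMENT,
                                                      llm_model="gpt-3.5-turbo")
    print(f"{len(fragments)} fragments, longest is {max(len(f) for f in fragments)} characters")
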
diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py
index 51f8811..fa27de5 100644
--- a/crazy_functions/pdf_fns/parse_pdf.py
+++ b/crazy_functions/pdf_fns/parse_pdf.py
@@ -74,7 +74,7 @@ def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chat
 
 def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG):
     from crazy_functions.pdf_fns.report_gen_html import construct_html
-    from crazy_functions.crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
     from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
     from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 
@@ -116,7 +116,7 @@ def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_fi
         # find a smooth token limit to achieve even separation
         count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT))
         token_limit_smooth = raw_token_num // count + count
-        return breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn=get_token_num, limit=token_limit_smooth)
+        return breakdown_text_to_satisfy_token_limit(txt, limit=token_limit_smooth, llm_model=llm_kwargs['llm_model'])
 
     for section in article_dict.get('sections'):
         if len(section['text']) == 0: continue
diff --git a/crazy_functions/总结word文档.py b/crazy_functions/总结word文档.py
index b392307..6dfe217 100644
--- a/crazy_functions/总结word文档.py
+++ b/crazy_functions/总结word文档.py
@@ -31,15 +31,11 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot
 
             print(file_content)
         # file names under private_upload often come out garbled after unzipping (rar and 7z are fine), so we only analyze the document content and do not pass the file name
-        from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+        from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
         from request_llms.bridge_all import model_info
         max_token = model_info[llm_kwargs['llm_model']]['max_token']
         TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4
-        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-            txt=file_content,
-            get_token_fn=model_info[llm_kwargs['llm_model']]['token_cnt'],
-            limit=TOKEN_LIMIT_PER_FRAGMENT
-        )
+        paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
         this_paper_history = []
         for i, paper_frag in enumerate(paper_fragments):
             i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```'
diff --git a/crazy_functions/批量Markdown翻译.py b/crazy_functions/批量Markdown翻译.py
index 12b4ef0..8665d6d 100644
--- a/crazy_functions/批量Markdown翻译.py
+++ b/crazy_functions/批量Markdown翻译.py
@@ -28,8 +28,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
diff --git a/crazy_functions/批量总结PDF文档.py b/crazy_functions/批量总结PDF文档.py
index 7fc3e41..e289c47 100644
--- a/crazy_functions/批量总结PDF文档.py
+++ b/crazy_functions/批量总结PDF文档.py
@@ -20,14 +20,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
 
     TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-    from request_llms.bridge_all import model_info
-    enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+    paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+    page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
 
     # for better results, strip everything after the Introduction (if present)
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
diff --git a/crazy_functions/批量翻译PDF文档_多线程.py b/crazy_functions/批量翻译PDF文档_多线程.py
index 73cf592..a1f0f31 100644
--- a/crazy_functions/批量翻译PDF文档_多线程.py
+++ b/crazy_functions/批量翻译PDF文档_多线程.py
@@ -91,14 +91,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
     page_one = str(page_one).encode('utf-8', 'ignore').decode()   # avoid reading non-utf8 chars
 
     # recursively split the PDF file
-    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-    from request_llms.bridge_all import model_info
-    enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=page_one, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+    paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+    page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=page_one, limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
 
     # for better results, strip everything after the Introduction (if present)
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
diff --git a/crazy_functions/理解PDF文档内容.py b/crazy_functions/理解PDF文档内容.py
index ef96788..439d78e 100644
--- a/crazy_functions/理解PDF文档内容.py
+++ b/crazy_functions/理解PDF文档内容.py
@@ -18,14 +18,9 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
 
     TOKEN_LIMIT_PER_FRAGMENT = 2500
 
-    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-    from request_llms.bridge_all import model_info
-    enc = model_info["gpt-3.5-turbo"]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
-    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+    paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+    page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
 
     # for better results, strip everything after the Introduction (if present)
     paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
@@ -45,7 +40,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
     for i in range(n_fragment):
         NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
         i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
-        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
+        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]} ...."
         gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = the actual question sent to chatgpt, i_say_show_user = the question shown to the user
                                                                            llm_kwargs, chatbot,
                                                                            history=["The main idea of the previous section is?", last_iteration_result], # iterate on the result of the previous section
diff --git a/crazy_functions/解析JupyterNotebook.py b/crazy_functions/解析JupyterNotebook.py
index eeccadf..3c2b578 100644
--- a/crazy_functions/解析JupyterNotebook.py
+++ b/crazy_functions/解析JupyterNotebook.py
@@ -12,13 +12,6 @@ class PaperFileGroup():
         self.sp_file_index = []
         self.sp_file_tag = []
-        # count_token
-        from request_llms.bridge_all import model_info
-        enc = model_info["gpt-3.5-turbo"]['tokenizer']
-        def get_token_num(txt): return len(
-            enc.encode(txt, disallowed_special=()))
-        self.get_token_num = get_token_num
-
     def run_file_split(self, max_token_limit=1900):
         """
         Split the long text into smaller pieces
         """
@@ -29,9 +22,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(
-                    file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)