From 3c271302cc4b1e06dc1fa8e11ba3bcabce92b018 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Tue, 19 Dec 2023 19:30:44 +0800
Subject: [PATCH 1/5] improve long text breakdown performance

---
 crazy_functions/Latex全文润色.py            |   4 +-
 crazy_functions/Latex全文翻译.py            |   4 +-
 crazy_functions/crazy_utils.py              |  90 ---------------
 crazy_functions/ipc_fns/mp.py               |  37 ++++++
 crazy_functions/latex_fns/latex_actions.py  |  11 +-
 crazy_functions/pdf_fns/breakdown_txt.py    | 125 +++++++++++++++++++++
 crazy_functions/pdf_fns/parse_pdf.py        |   4 +-
 crazy_functions/总结word文档.py             |   8 +-
 crazy_functions/批量Markdown翻译.py         |   4 +-
 crazy_functions/批量总结PDF文档.py          |  11 +-
 crazy_functions/批量翻译PDF文档_多线程.py   |  11 +-
 crazy_functions/理解PDF文档内容.py          |  13 +--
 crazy_functions/解析JupyterNotebook.py      |  12 +-
 13 files changed, 186 insertions(+), 148 deletions(-)
 create mode 100644 crazy_functions/ipc_fns/mp.py
 create mode 100644 crazy_functions/pdf_fns/breakdown_txt.py

diff --git a/crazy_functions/Latex全文润色.py b/crazy_functions/Latex全文润色.py
index 0bc7d40..b736fe8 100644
--- a/crazy_functions/Latex全文润色.py
+++ b/crazy_functions/Latex全文润色.py
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
diff --git a/crazy_functions/Latex全文翻译.py b/crazy_functions/Latex全文翻译.py
index 846bd80..49470c8 100644
--- a/crazy_functions/Latex全文翻译.py
+++ b/crazy_functions/Latex全文翻译.py
@@ -26,8 +26,8 @@ class PaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py
index 9778053..731da1a 100644
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -312,95 +312,6 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
     return gpt_response_collection
 
 
-def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
-    def cut(txt_tocut, must_break_at_empty_line): # 递归
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                print(cnt)
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                raise RuntimeError("存在一行极长的文本!")
-            # print(len(post))
-            # 列表递归接龙
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line))
-            return result
-    try:
-        return cut(txt, 
must_break_at_empty_line=True) - except RuntimeError: - return cut(txt, must_break_at_empty_line=False) - - -def force_breakdown(txt, limit, get_token_fn): - """ - 当无法用标点、空行分割时,我们用最暴力的方法切割 - """ - for i in reversed(range(len(txt))): - if get_token_fn(txt[:i]) < limit: - return txt[:i], txt[i:] - return "Tiktoken未知错误", "Tiktoken未知错误" - -def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit): - # 递归 - def cut(txt_tocut, must_break_at_empty_line, break_anyway=False): - if get_token_fn(txt_tocut) <= limit: - return [txt_tocut] - else: - lines = txt_tocut.split('\n') - estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines) - estimated_line_cut = int(estimated_line_cut) - cnt = 0 - for cnt in reversed(range(estimated_line_cut)): - if must_break_at_empty_line: - if lines[cnt] != "": - continue - prev = "\n".join(lines[:cnt]) - post = "\n".join(lines[cnt:]) - if get_token_fn(prev) < limit: - break - if cnt == 0: - if break_anyway: - prev, post = force_breakdown(txt_tocut, limit, get_token_fn) - else: - raise RuntimeError(f"存在一行极长的文本!{txt_tocut}") - # print(len(post)) - # 列表递归接龙 - result = [prev] - result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway)) - return result - try: - # 第1次尝试,将双空行(\n\n)作为切分点 - return cut(txt, must_break_at_empty_line=True) - except RuntimeError: - try: - # 第2次尝试,将单空行(\n)作为切分点 - return cut(txt, must_break_at_empty_line=False) - except RuntimeError: - try: - # 第3次尝试,将英文句号(.)作为切分点 - res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在 - return [r.replace('。\n', '.') for r in res] - except RuntimeError as e: - try: - # 第4次尝试,将中文句号(。)作为切分点 - res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False) - return [r.replace('。。\n', '。') for r in res] - except RuntimeError as e: - # 第5次尝试,没办法了,随便切一下敷衍吧 - return cut(txt, must_break_at_empty_line=False, break_anyway=True) - - def read_and_clean_pdf_text(fp): """ @@ -631,7 +542,6 @@ def get_files_from_everything(txt, type): # type='.md' - @Singleton class nougat_interface(): def __init__(self): diff --git a/crazy_functions/ipc_fns/mp.py b/crazy_functions/ipc_fns/mp.py new file mode 100644 index 0000000..575d47c --- /dev/null +++ b/crazy_functions/ipc_fns/mp.py @@ -0,0 +1,37 @@ +import platform +import pickle +import multiprocessing + +def run_in_subprocess_wrapper_func(v_args): + func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args) + import sys + try: + result = func(*args, **kwargs) + return_dict['result'] = result + except Exception as e: + exc_info = sys.exc_info() + exception_dict['exception'] = exc_info + +def run_in_subprocess_with_timeout(func, timeout=60): + if platform.system() == 'Linux': + def wrapper(*args, **kwargs): + return_dict = multiprocessing.Manager().dict() + exception_dict = multiprocessing.Manager().dict() + v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict)) + process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,)) + process.start() + process.join(timeout) + if process.is_alive(): + process.terminate() + raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务') + process.close() + if 'exception' in exception_dict: + # ooops, the subprocess ran into an exception + exc_info = exception_dict['exception'] + raise exc_info[1].with_traceback(exc_info[2]) + if 'result' in return_dict.keys(): + # If the subprocess ran successfully, return the result + return return_dict['result'] + return wrapper + else: + return func \ No newline at end of file diff --git 
a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py
index b80c01d..6638c12 100644
--- a/crazy_functions/latex_fns/latex_actions.py
+++ b/crazy_functions/latex_fns/latex_actions.py
@@ -176,12 +176,6 @@ class LatexPaperFileGroup():
         self.sp_file_index = []
         self.sp_file_tag = []
 
-        # count_token
-        from request_llms.bridge_all import model_info
-        enc = model_info["gpt-3.5-turbo"]['tokenizer']
-        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-        self.get_token_num = get_token_num
-
     def run_file_split(self, max_token_limit=1900):
         """
         use tokenizer to break down text according to max_token_limit
@@ -192,13 +186,12 @@ class LatexPaperFileGroup():
                 self.sp_file_index.append(index)
                 self.sp_file_tag.append(self.file_paths[index])
             else:
-                from ..crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
-                segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
+                from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+                segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
                 for j, segment in enumerate(segments):
                     self.sp_file_contents.append(segment)
                     self.sp_file_index.append(index)
                     self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
-        print('Segmentation: done')
 
     def merge_result(self):
         self.file_result = ["" for _ in range(len(self.file_paths))]
diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py
new file mode 100644
index 0000000..1db8696
--- /dev/null
+++ b/crazy_functions/pdf_fns/breakdown_txt.py
@@ -0,0 +1,125 @@
+from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
+
+def force_breakdown(txt, limit, get_token_fn):
+    """ 当无法用标点、空行分割时,我们用最暴力的方法切割
+    """
+    for i in reversed(range(len(txt))):
+        if get_token_fn(txt[:i]) < limit:
+            return txt[:i], txt[i:]
+    return "Tiktoken未知错误", "Tiktoken未知错误"
+
+
+def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
+    """ 为了加速计算,我们采用一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage
+    当 remain_txt_to_cut < `_min` 时,我们再把 remain_txt_to_cut_storage 中的部分文字取出
+    """
+    _min = int(5e4)
+    _max = int(1e5)
+    # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage))
+    if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0:
+        remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage
+        remain_txt_to_cut_storage = ""
+    if len(remain_txt_to_cut) > _max:
+        remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage
+        remain_txt_to_cut = remain_txt_to_cut[:_max]
+    return remain_txt_to_cut, remain_txt_to_cut_storage
+
+
+def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
+    """ 文本切分
+    """
+    res = []
+    total_len = len(txt_tocut)
+    fin_len = 0
+    remain_txt_to_cut = txt_tocut
+    remain_txt_to_cut_storage = ""
+    # 为了加速计算,我们采用一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage
+    remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+
+    while True:
+        if get_token_fn(remain_txt_to_cut) <= limit:
+            # 如果剩余文本的token数小于限制,那么就不用切了
+            res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut)
+            break
+        else:
+            # 如果剩余文本的token数大于限制,那么就切
+            lines = remain_txt_to_cut.split('\n')
+
+            # 估计一个切分点
+            estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
+            estimated_line_cut = int(estimated_line_cut)
+
+            # 开始查找合适切分点的偏移(cnt)
+            
cnt = 0 + for cnt in reversed(range(estimated_line_cut)): + if must_break_at_empty_line: + # 首先尝试用双空行(\n\n)作为切分点 + if lines[cnt] != "": + continue + prev = "\n".join(lines[:cnt]) + post = "\n".join(lines[cnt:]) + if get_token_fn(prev) < limit: + break + + if cnt == 0: + # 如果没有找到合适的切分点 + if break_anyway: + # 是否允许暴力切分 + prev, post = force_breakdown(txt_tocut, limit, get_token_fn) + else: + # 不允许直接报错 + raise RuntimeError(f"存在一行极长的文本!{txt_tocut}") + + # 追加列表 + res.append(prev); fin_len+=len(prev) + # 准备下一次迭代 + remain_txt_to_cut = post + remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage) + process = fin_len/total_len + print(f'\r正在文本切分 {int(process*100)}%', end='') + if len(remain_txt_to_cut.strip()) == 0: + break + return res + + +def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"): + """ 使用多种方式尝试切分文本,以满足 token 限制 + """ + from request_llms.bridge_all import model_info + enc = model_info[llm_model]['tokenizer'] + def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=())) + try: + # 第1次尝试,将双空行(\n\n)作为切分点 + return cut(limit, get_token_fn, txt, must_break_at_empty_line=True) + except RuntimeError: + try: + # 第2次尝试,将单空行(\n)作为切分点 + return cut(limit, get_token_fn, txt, must_break_at_empty_line=False) + except RuntimeError: + try: + # 第3次尝试,将英文句号(.)作为切分点 + res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在 + return [r.replace('。\n', '.') for r in res] + except RuntimeError as e: + try: + # 第4次尝试,将中文句号(。)作为切分点 + res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False) + return [r.replace('。。\n', '。') for r in res] + except RuntimeError as e: + # 第5次尝试,没办法了,随便切一下吧 + return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True) + +breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60) + +if __name__ == '__main__': + from crazy_functions.crazy_utils import read_and_clean_pdf_text + file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf") + + from request_llms.bridge_all import model_info + for i in range(5): + file_content += file_content + + print(len(file_content)) + TOKEN_LIMIT_PER_FRAGMENT = 2500 + res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT) + diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py index 51f8811..fa27de5 100644 --- a/crazy_functions/pdf_fns/parse_pdf.py +++ b/crazy_functions/pdf_fns/parse_pdf.py @@ -74,7 +74,7 @@ def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chat def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG): from crazy_functions.pdf_fns.report_gen_html import construct_html - from crazy_functions.crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency @@ -116,7 +116,7 @@ def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_fi # find a smooth token limit to achieve even seperation count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT)) token_limit_smooth = raw_token_num // count + 
count - return breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn=get_token_num, limit=token_limit_smooth) + return breakdown_text_to_satisfy_token_limit(txt, limit=token_limit_smooth, llm_model=llm_kwargs['llm_model']) for section in article_dict.get('sections'): if len(section['text']) == 0: continue diff --git a/crazy_functions/总结word文档.py b/crazy_functions/总结word文档.py index b392307..6dfe217 100644 --- a/crazy_functions/总结word文档.py +++ b/crazy_functions/总结word文档.py @@ -31,15 +31,11 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot print(file_content) # private_upload里面的文件名在解压zip后容易出现乱码(rar和7z格式正常),故可以只分析文章内容,不输入文件名 - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit from request_llms.bridge_all import model_info max_token = model_info[llm_kwargs['llm_model']]['max_token'] TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4 - paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=file_content, - get_token_fn=model_info[llm_kwargs['llm_model']]['token_cnt'], - limit=TOKEN_LIMIT_PER_FRAGMENT - ) + paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) this_paper_history = [] for i, paper_frag in enumerate(paper_fragments): i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```' diff --git a/crazy_functions/批量Markdown翻译.py b/crazy_functions/批量Markdown翻译.py index 12b4ef0..8665d6d 100644 --- a/crazy_functions/批量Markdown翻译.py +++ b/crazy_functions/批量Markdown翻译.py @@ -28,8 +28,8 @@ class PaperFileGroup(): self.sp_file_index.append(index) self.sp_file_tag.append(self.file_paths[index]) else: - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf - segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit) + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit) for j, segment in enumerate(segments): self.sp_file_contents.append(segment) self.sp_file_index.append(index) diff --git a/crazy_functions/批量总结PDF文档.py b/crazy_functions/批量总结PDF文档.py index 7fc3e41..e289c47 100644 --- a/crazy_functions/批量总结PDF文档.py +++ b/crazy_functions/批量总结PDF文档.py @@ -20,14 +20,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, TOKEN_LIMIT_PER_FRAGMENT = 2500 - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) - page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4) + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) + page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model']) # 为了更好的效果,我们剥离Introduction之后的部分(如果有) paper_meta = 
page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] diff --git a/crazy_functions/批量翻译PDF文档_多线程.py b/crazy_functions/批量翻译PDF文档_多线程.py index 73cf592..a1f0f31 100644 --- a/crazy_functions/批量翻译PDF文档_多线程.py +++ b/crazy_functions/批量翻译PDF文档_多线程.py @@ -91,14 +91,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars # 递归地切割PDF文件 - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) - page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=page_one, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4) + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) + page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=page_one, limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model']) # 为了更好的效果,我们剥离Introduction之后的部分(如果有) paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] diff --git a/crazy_functions/理解PDF文档内容.py b/crazy_functions/理解PDF文档内容.py index ef96788..439d78e 100644 --- a/crazy_functions/理解PDF文档内容.py +++ b/crazy_functions/理解PDF文档内容.py @@ -18,14 +18,9 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro TOKEN_LIMIT_PER_FRAGMENT = 2500 - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) - page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4) + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) + page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model']) # 为了更好的效果,我们剥离Introduction之后的部分(如果有) paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] @@ -45,7 +40,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro for i in range(n_fragment): NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}" - i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}" + i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: 
{paper_fragments[i][:200]} ...." gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, # i_say=真正给chatgpt的提问, i_say_show_user=给用户看的提问 llm_kwargs, chatbot, history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果 diff --git a/crazy_functions/解析JupyterNotebook.py b/crazy_functions/解析JupyterNotebook.py index eeccadf..3c2b578 100644 --- a/crazy_functions/解析JupyterNotebook.py +++ b/crazy_functions/解析JupyterNotebook.py @@ -12,13 +12,6 @@ class PaperFileGroup(): self.sp_file_index = [] self.sp_file_tag = [] - # count_token - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len( - enc.encode(txt, disallowed_special=())) - self.get_token_num = get_token_num - def run_file_split(self, max_token_limit=1900): """ 将长文本分离开来 @@ -29,9 +22,8 @@ class PaperFileGroup(): self.sp_file_index.append(index) self.sp_file_tag.append(self.file_paths[index]) else: - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf - segments = breakdown_txt_to_satisfy_token_limit_for_pdf( - file_content, self.get_token_num, max_token_limit) + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit) for j, segment in enumerate(segments): self.sp_file_contents.append(segment) self.sp_file_index.append(index) From 9479dd984c3ff07bfe0cf963be220299607fbad7 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Tue, 19 Dec 2023 19:43:03 +0800 Subject: [PATCH 2/5] avoid adding the same file multiple times to the chatbot's files_to_promote list --- crazy_functions/pdf_fns/breakdown_txt.py | 2 +- crazy_functions/总结word文档.py | 1 - toolbox.py | 3 ++- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py index 1db8696..a961481 100644 --- a/crazy_functions/pdf_fns/breakdown_txt.py +++ b/crazy_functions/pdf_fns/breakdown_txt.py @@ -76,7 +76,7 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F remain_txt_to_cut = post remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage) process = fin_len/total_len - print(f'\r正在文本切分 {int(process*100)}%', end='') + print(f'正在文本切分 {int(process*100)}%') if len(remain_txt_to_cut.strip()) == 0: break return res diff --git a/crazy_functions/总结word文档.py b/crazy_functions/总结word文档.py index 6dfe217..01ee1e6 100644 --- a/crazy_functions/总结word文档.py +++ b/crazy_functions/总结word文档.py @@ -29,7 +29,6 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot except: raise RuntimeError('请先将.doc文档转换为.docx文档。') - print(file_content) # private_upload里面的文件名在解压zip后容易出现乱码(rar和7z格式正常),故可以只分析文章内容,不输入文件名 from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit from request_llms.bridge_all import model_info diff --git a/toolbox.py b/toolbox.py index bb4ec66..e44d61e 100644 --- a/toolbox.py +++ b/toolbox.py @@ -583,7 +583,8 @@ def promote_file_to_downloadzone(file, rename_file=None, chatbot=None): if chatbot is not None: if 'files_to_promote' in chatbot._cookies: current = chatbot._cookies['files_to_promote'] else: current = [] - chatbot._cookies.update({'files_to_promote': [new_path] + current}) + if new_path not in current: # 避免把同一个文件添加多次 + chatbot._cookies.update({'files_to_promote': [new_path] + current}) return new_path From 
ac3d4cf073e10fdb854e6daf163af33ec0de1490 Mon Sep 17 00:00:00 2001
From: leike0813
Date: Wed, 20 Dec 2023 07:37:26 +0800
Subject: [PATCH 3/5] Add support for Aliyun Qwen online models.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename model tag "qwen" to "qwen-local"
Add model tags "qwen-turbo", "qwen-plus", "qwen-max"
Add corresponding model interfaces in request_llms/bridge_all.py
Add configuration variable "DASHSCOPE_API_KEY"
Rename request_llms/bridge_qwen.py to bridge_qwen_local.py to distinguish it from the online model interface
---
 config.py                                |  16 +++-
 docs/translate_english.json              |   2 +-
 request_llms/bridge_all.py               |  38 +++++++-
 request_llms/bridge_qwen.py              | 107 ++++++++++++-----------
 request_llms/bridge_qwen_local.py        |  59 +++++++++++++
 request_llms/com_qwenapi.py              |  85 ++++++++++++++++++
 request_llms/requirements_qwen.txt       |   5 +-
 request_llms/requirements_qwen_local.txt |   4 +
 tests/test_llms.py                       |   2 +-
 9 files changed, 255 insertions(+), 63 deletions(-)
 create mode 100644 request_llms/bridge_qwen_local.py
 create mode 100644 request_llms/com_qwenapi.py
 create mode 100644 request_llms/requirements_qwen_local.txt

diff --git a/config.py b/config.py
index 3d80962..17dac34 100644
--- a/config.py
+++ b/config.py
@@ -92,8 +92,9 @@ AVAIL_LLM_MODELS = ["gpt-3.5-turbo-1106","gpt-4-1106-preview","gpt-4-vision-prev
                     "api2d-gpt-3.5-turbo", 'api2d-gpt-3.5-turbo-16k',
                     "gpt-4", "gpt-4-32k", "azure-gpt-4", "api2d-gpt-4",
                     "chatglm3", "moss", "claude-2"]
-# P.S. 其他可用的模型还包括 ["zhipuai", "qianfan", "deepseekcoder", "llama2", "qwen", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-3.5-random"
-#                          "spark", "sparkv2", "sparkv3", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_pangualpha", "jittorllms_llama"]
+# P.S. 其他可用的模型还包括 ["zhipuai", "qianfan", "deepseekcoder", "llama2", "qwen-local", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-3.5-random"
+#                          "spark", "sparkv2", "sparkv3", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_pangualpha", "jittorllms_llama"
+#                          "qwen-turbo", "qwen-plus", "qwen-max"]
 
 # 定义界面上“询问多个GPT模型”插件应该使用哪些模型,请从AVAIL_LLM_MODELS中选择,并在不同模型之间用`&`间隔,例如"gpt-3.5-turbo&chatglm3&azure-gpt-4"
 MULTI_QUERY_LLM_MODELS = "gpt-3.5-turbo&chatglm3"
@@ -103,7 +104,11 @@ MULTI_QUERY_LLM_MODELS = "gpt-3.5-turbo&chatglm3"
 # 选择本地模型变体(只有当AVAIL_LLM_MODELS包含了对应本地模型时,才会起作用)
 # 如果你选择Qwen系列的模型,那么请在下面的QWEN_MODEL_SELECTION中指定具体的模型
 # 也可以是具体的模型路径
-QWEN_MODEL_SELECTION = "Qwen/Qwen-1_8B-Chat-Int8"
+QWEN_LOCAL_MODEL_SELECTION = "Qwen/Qwen-1_8B-Chat-Int8"
+
+
+# 接入通义千问在线大模型 https://dashscope.console.aliyun.com/
+DASHSCOPE_API_KEY = "此处填阿里灵积云API秘钥" # 阿里灵积云API_KEY
 
 
 # 百度千帆(LLM_MODEL="qianfan")
@@ -284,6 +289,9 @@ NUM_CUSTOM_BASIC_BTN = 4
 │   ├── ZHIPUAI_API_KEY
 │   └── ZHIPUAI_MODEL
 │
+├── "qwen-turbo" 等通义千问大模型
+│   └── DASHSCOPE_API_KEY
+│
 └── "newbing" Newbing接口不再稳定,不推荐使用
     ├── NEWBING_STYLE
     └── NEWBING_COOKIES
@@ -300,7 +308,7 @@
 ├── "jittorllms_pangualpha"
 ├── "jittorllms_llama"
 ├── "deepseekcoder"
-├── "qwen"
+├── "qwen-local"
 ├── RWKV的支持见Wiki
 └── "llama2"
 
diff --git a/docs/translate_english.json b/docs/translate_english.json
index 3920e1f..c48ec6b 100644
--- a/docs/translate_english.json
+++ b/docs/translate_english.json
@@ -2932,7 +2932,7 @@
     "3. 输入修改需求": "3. 
Enter modification requirements", "刷新界面 由于请求gpt需要一段时间": "Refreshing the interface takes some time due to the request for gpt", "随机小游戏": "Random mini game", - "那么请在下面的QWEN_MODEL_SELECTION中指定具体的模型": "So please specify the specific model in QWEN_MODEL_SELECTION below", + "那么请在下面的QWEN_LOCAL_MODEL_SELECTION中指定具体的模型": "So please specify the specific model in QWEN_LOCAL_MODEL_SELECTION below", "表值": "Table value", "我画你猜": "I draw, you guess", "狗": "Dog", diff --git a/request_llms/bridge_all.py b/request_llms/bridge_all.py index dcfeba9..689b1f9 100644 --- a/request_llms/bridge_all.py +++ b/request_llms/bridge_all.py @@ -431,16 +431,48 @@ if "chatglm_onnx" in AVAIL_LLM_MODELS: }) except: print(trimmed_format_exc()) -if "qwen" in AVAIL_LLM_MODELS: +if "qwen-local" in AVAIL_LLM_MODELS: + try: + from .bridge_qwen_local import predict_no_ui_long_connection as qwen_local_noui + from .bridge_qwen_local import predict as qwen_local_ui + model_info.update({ + "qwen-local": { + "fn_with_ui": qwen_local_ui, + "fn_without_ui": qwen_local_noui, + "endpoint": None, + "max_token": 4096, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + } + }) + except: + print(trimmed_format_exc()) +if "qwen-turbo" in AVAIL_LLM_MODELS or "qwen-plus" in AVAIL_LLM_MODELS or "qwen-max" in AVAIL_LLM_MODELS: # zhipuai try: from .bridge_qwen import predict_no_ui_long_connection as qwen_noui from .bridge_qwen import predict as qwen_ui model_info.update({ - "qwen": { + "qwen-turbo": { "fn_with_ui": qwen_ui, "fn_without_ui": qwen_noui, "endpoint": None, - "max_token": 4096, + "max_token": 6144, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + }, + "qwen-plus": { + "fn_with_ui": qwen_ui, + "fn_without_ui": qwen_noui, + "endpoint": None, + "max_token": 30720, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + }, + "qwen-max": { + "fn_with_ui": qwen_ui, + "fn_without_ui": qwen_noui, + "endpoint": None, + "max_token": 28672, "tokenizer": tokenizer_gpt35, "token_cnt": get_token_num_gpt35, } diff --git a/request_llms/bridge_qwen.py b/request_llms/bridge_qwen.py index 940c41d..583def8 100644 --- a/request_llms/bridge_qwen.py +++ b/request_llms/bridge_qwen.py @@ -1,59 +1,66 @@ -model_name = "Qwen" -cmd_to_install = "`pip install -r request_llms/requirements_qwen.txt`" +import time +import os +from toolbox import update_ui, get_conf, update_ui_lastest_msg +from toolbox import check_packages, report_exception -from toolbox import ProxyNetworkActivate, get_conf -from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns +model_name = 'Qwen' + +def validate_key(): + DASHSCOPE_API_KEY = get_conf("DASHSCOPE_API_KEY") + if DASHSCOPE_API_KEY == '': return False + return True + +if not validate_key(): + raise RuntimeError('请配置DASHSCOPE_API_KEY') +os.environ['DASHSCOPE_API_KEY'] = get_conf("DASHSCOPE_API_KEY") +def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): + """ + ⭐多线程方法 + 函数的说明请见 request_llms/bridge_all.py + """ + watch_dog_patience = 5 + response = "" -# ------------------------------------------------------------------------------------------------------------------------ -# 🔌💻 Local Model -# ------------------------------------------------------------------------------------------------------------------------ -class GetQwenLMHandle(LocalLLMHandle): + from .com_qwenapi import QwenRequestInstance + sri = QwenRequestInstance() + for response in sri.generate(inputs, llm_kwargs, history, sys_prompt): + if 
len(observe_window) >= 1: + observe_window[0] = response + if len(observe_window) >= 2: + if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。") + return response - def load_model_info(self): - # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 - self.model_name = model_name - self.cmd_to_install = cmd_to_install +def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None): + """ + ⭐单线程方法 + 函数的说明请见 request_llms/bridge_all.py + """ + chatbot.append((inputs, "")) + yield from update_ui(chatbot=chatbot, history=history) - def load_model_and_tokenizer(self): - # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 - # from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - with ProxyNetworkActivate('Download_LLM'): - model_id = get_conf('QWEN_MODEL_SELECTION') - self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, resume_download=True) - # use fp16 - model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参 - self._model = model + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + check_packages(["dashscope"]) + except: + yield from update_ui_lastest_msg(f"导入软件依赖失败。使用该模型需要额外依赖,安装方法```pip install --upgrade dashscope```。", + chatbot=chatbot, history=history, delay=0) + return - return self._model, self._tokenizer + if additional_fn is not None: + from core_functional import handle_core_functionality + inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot) - def llm_stream_generator(self, **kwargs): - # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 - def adaptor(kwargs): - query = kwargs['query'] - max_length = kwargs['max_length'] - top_p = kwargs['top_p'] - temperature = kwargs['temperature'] - history = kwargs['history'] - return query, max_length, top_p, temperature, history + # 开始接收回复 + from .com_qwenapi import QwenRequestInstance + sri = QwenRequestInstance() + for response in sri.generate(inputs, llm_kwargs, history, system_prompt): + chatbot[-1] = (inputs, response) + yield from update_ui(chatbot=chatbot, history=history) - query, max_length, top_p, temperature, history = adaptor(kwargs) - - for response in self._model.chat_stream(self._tokenizer, query, history=history): - yield response - - def try_to_import_special_deps(self, **kwargs): - # import something that will raise error if the user does not install requirement_*.txt - # 🏃‍♂️🏃‍♂️🏃‍♂️ 主进程执行 - import importlib - importlib.import_module('modelscope') - - -# ------------------------------------------------------------------------------------------------------------------------ -# 🔌💻 GPT-Academic Interface -# ------------------------------------------------------------------------------------------------------------------------ -predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetQwenLMHandle, model_name) \ No newline at end of file + # 总结输出 + if response == f"[Local Message] 等待{model_name}响应中 ...": + response = f"[Local Message] {model_name}响应异常 ..." 
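+    # append the final Q&A pair to history so the next round of dialogue keeps the full context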
+ history.extend([inputs, response]) + yield from update_ui(chatbot=chatbot, history=history) \ No newline at end of file diff --git a/request_llms/bridge_qwen_local.py b/request_llms/bridge_qwen_local.py new file mode 100644 index 0000000..4a0fa69 --- /dev/null +++ b/request_llms/bridge_qwen_local.py @@ -0,0 +1,59 @@ +model_name = "Qwen_local" +cmd_to_install = "`pip install -r request_llms/requirements_qwen_local.txt`" + +from toolbox import ProxyNetworkActivate, get_conf +from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Local Model +# ------------------------------------------------------------------------------------------------------------------------ +class GetQwenLMHandle(LocalLLMHandle): + + def load_model_info(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + self.model_name = model_name + self.cmd_to_install = cmd_to_install + + def load_model_and_tokenizer(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + # from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig + from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers.generation import GenerationConfig + with ProxyNetworkActivate('Download_LLM'): + model_id = get_conf('QWEN_LOCAL_MODEL_SELECTION') + self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, resume_download=True) + # use fp16 + model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval() + model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参 + self._model = model + + return self._model, self._tokenizer + + def llm_stream_generator(self, **kwargs): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + def adaptor(kwargs): + query = kwargs['query'] + max_length = kwargs['max_length'] + top_p = kwargs['top_p'] + temperature = kwargs['temperature'] + history = kwargs['history'] + return query, max_length, top_p, temperature, history + + query, max_length, top_p, temperature, history = adaptor(kwargs) + + for response in self._model.chat_stream(self._tokenizer, query, history=history): + yield response + + def try_to_import_special_deps(self, **kwargs): + # import something that will raise error if the user does not install requirement_*.txt + # 🏃‍♂️🏃‍♂️🏃‍♂️ 主进程执行 + import importlib + importlib.import_module('modelscope') + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 GPT-Academic Interface +# ------------------------------------------------------------------------------------------------------------------------ +predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetQwenLMHandle, model_name) \ No newline at end of file diff --git a/request_llms/com_qwenapi.py b/request_llms/com_qwenapi.py new file mode 100644 index 0000000..63ebdea --- /dev/null +++ b/request_llms/com_qwenapi.py @@ -0,0 +1,85 @@ +from http import HTTPStatus +from toolbox import get_conf +import threading +import logging + +timeout_bot_msg = '[Local Message] Request timeout. Network error.' 
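+# generate_message_payload() below assembles the dashscope message list, e.g.:
+#   [{"role": "user", "content": system_prompt}, {"role": "assistant", "content": "Certainly!"},
+#    {"role": "user", "content": history[0]}, {"role": "assistant", "content": history[1]},
+#    ..., {"role": "user", "content": inputs}]
+# answers equal to timeout_bot_msg are filtered out so timed-out turns do not pollute the payload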
+ +class QwenRequestInstance(): + def __init__(self): + + self.time_to_yield_event = threading.Event() + self.time_to_exit_event = threading.Event() + + self.result_buf = "" + + def generate(self, inputs, llm_kwargs, history, system_prompt): + # import _thread as thread + from dashscope import Generation + QWEN_MODEL = { + 'qwen-turbo': Generation.Models.qwen_turbo, + 'qwen-plus': Generation.Models.qwen_plus, + 'qwen-max': Generation.Models.qwen_max, + }[llm_kwargs['llm_model']] + top_p = llm_kwargs.get('top_p', 0.8) + if top_p == 0: top_p += 1e-5 + if top_p == 1: top_p -= 1e-5 + + self.result_buf = "" + responses = Generation.call( + model=QWEN_MODEL, + messages=generate_message_payload(inputs, llm_kwargs, history, system_prompt), + top_p=top_p, + temperature=llm_kwargs.get('temperature', 1.0), + result_format='message', + stream=True, + incremental_output=True + ) + + for response in responses: + if response.status_code == HTTPStatus.OK: + if response.output.choices[0].finish_reason == 'stop': + yield self.result_buf + break + elif response.output.choices[0].finish_reason == 'length': + self.result_buf += "[Local Message] 生成长度过长,后续输出被截断" + yield self.result_buf + break + else: + self.result_buf += response.output.choices[0].message.content + yield self.result_buf + else: + self.result_buf += f"[Local Message] 请求错误:状态码:{response.status_code},错误码:{response.code},消息:{response.message}" + yield self.result_buf + break + logging.info(f'[raw_input] {inputs}') + logging.info(f'[response] {self.result_buf}') + return self.result_buf + + +def generate_message_payload(inputs, llm_kwargs, history, system_prompt): + conversation_cnt = len(history) // 2 + if system_prompt == '': system_prompt = 'Hello!' + messages = [{"role": "user", "content": system_prompt}, {"role": "assistant", "content": "Certainly!"}] + if conversation_cnt: + for index in range(0, 2*conversation_cnt, 2): + what_i_have_asked = {} + what_i_have_asked["role"] = "user" + what_i_have_asked["content"] = history[index] + what_gpt_answer = {} + what_gpt_answer["role"] = "assistant" + what_gpt_answer["content"] = history[index+1] + if what_i_have_asked["content"] != "": + if what_gpt_answer["content"] == "": + continue + if what_gpt_answer["content"] == timeout_bot_msg: + continue + messages.append(what_i_have_asked) + messages.append(what_gpt_answer) + else: + messages[-1]['content'] = what_gpt_answer['content'] + what_i_ask_now = {} + what_i_ask_now["role"] = "user" + what_i_ask_now["content"] = inputs + messages.append(what_i_ask_now) + return messages diff --git a/request_llms/requirements_qwen.txt b/request_llms/requirements_qwen.txt index ea65dee..5899464 100644 --- a/request_llms/requirements_qwen.txt +++ b/request_llms/requirements_qwen.txt @@ -1,4 +1 @@ -modelscope -transformers_stream_generator -auto-gptq -optimum \ No newline at end of file +dashscope \ No newline at end of file diff --git a/request_llms/requirements_qwen_local.txt b/request_llms/requirements_qwen_local.txt new file mode 100644 index 0000000..ea65dee --- /dev/null +++ b/request_llms/requirements_qwen_local.txt @@ -0,0 +1,4 @@ +modelscope +transformers_stream_generator +auto-gptq +optimum \ No newline at end of file diff --git a/tests/test_llms.py b/tests/test_llms.py index bdb622b..347c6b9 100644 --- a/tests/test_llms.py +++ b/tests/test_llms.py @@ -18,7 +18,7 @@ if __name__ == "__main__": # from request_llms.bridge_internlm import predict_no_ui_long_connection # from request_llms.bridge_deepseekcoder import predict_no_ui_long_connection # from 
request_llms.bridge_qwen_7B import predict_no_ui_long_connection
-    from request_llms.bridge_qwen import predict_no_ui_long_connection
+    from request_llms.bridge_qwen_local import predict_no_ui_long_connection
     # from request_llms.bridge_spark import predict_no_ui_long_connection
     # from request_llms.bridge_zhipu import predict_no_ui_long_connection
     # from request_llms.bridge_chatglm3 import predict_no_ui_long_connection

From 68a49d3758782772afb912b327acd504fe0f1e99 Mon Sep 17 00:00:00 2001
From: leike0813
Date: Wed, 20 Dec 2023 07:44:53 +0800
Subject: [PATCH 4/5] Add 2 plugins
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This effectively splits the "批量总结PDF文档" plugin into two stages: a cheap model does the rough work first, and only the crucial final summary is handed to GPT-4, which lowers the cost of use.
批量总结PDF文档_初步: makes a first-pass summary of the PDFs, producing one md document per PDF.
批量总结Markdown文档_进阶: condenses all the md documents and merges them into a single md document; the output of "批量总结PDF文档_初步" can be used directly as its input.
---
 crazy_functional.py                         |  28 ++++
 crazy_functions/批量总结Markdown文档_进阶.py | 127 ++++++++++++++++++++
 crazy_functions/批量总结PDF文档_初步.py      | 131 +++++++++++++++++++++
 3 files changed, 286 insertions(+)
 create mode 100644 crazy_functions/批量总结Markdown文档_进阶.py
 create mode 100644 crazy_functions/批量总结PDF文档_初步.py

diff --git a/crazy_functional.py b/crazy_functional.py
index 4cc6304..3275f79 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -603,7 +603,35 @@ def get_crazy_functions():
     except:
         print(trimmed_format_exc())
         print('Load function plugin failed')
+    try:
+        from crazy_functions.批量总结PDF文档_初步 import 批量总结PDF文档_初步
+        function_plugins.update({
+            "批量总结PDF文档_初步": {
+                "Group": "学术",
+                "Color": "stop",
+                "AsButton": False,
+                "Info": "批量总结PDF文档的内容(仅做初步提炼) | 输入参数为路径",
+                "Function": HotReload(批量总结PDF文档_初步)
+            }
+        })
+    except:
+        print(trimmed_format_exc())
+        print('Load function plugin failed')
+    try:
+        from crazy_functions.批量总结Markdown文档_进阶 import 批量总结Markdown文档_进阶
+        function_plugins.update({
+            "批量总结Markdown文档_进阶": {
+                "Group": "学术",
+                "Color": "stop",
+                "AsButton": False,
+                "Info": "批量总结Markdown文档的内容(在初步提炼的基础上进一步总结) | 输入参数为路径",
+                "Function": HotReload(批量总结Markdown文档_进阶)
+            }
+        })
+    except:
+        print(trimmed_format_exc())
+        print('Load function plugin failed')
 
 # try:
 #     from crazy_functions.chatglm微调工具 import 微调数据集生成
 #     function_plugins.update({
diff --git a/crazy_functions/批量总结Markdown文档_进阶.py b/crazy_functions/批量总结Markdown文档_进阶.py
new file mode 100644
index 0000000..cdbff7a
--- /dev/null
+++ b/crazy_functions/批量总结Markdown文档_进阶.py
@@ -0,0 +1,127 @@
+import logging, os
+from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder
+from toolbox import CatchException, report_exception, trimmed_format_exc
+from toolbox import write_history_to_file, promote_file_to_downloadzone
+from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
+from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
+from .crazy_utils import input_clipping
+
+
+def 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
+    file_write_buffer = []
+    SUMMARY_WORD_LIMIT = 800
+    meta_inputs_array = []
+    meta_inputs_show_user_array = []
+    meta_sys_prompt_array = []
+    inputs_array = []
+    inputs_show_user_array = []
+    sys_prompt_array = []
+    file_name_array = []
+    for idx, file_name in enumerate(file_manifest):
+        print('begin analysis on:', file_name)
+        file_name_array.append(f'# {idx}.{os.path.basename(file_name)}')
+
+        with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
+            file_content = f.read()
+
+        _ = file_content.split('## metadata')
+        if len(_) >= 2:
+            file_meta = _[-2]
+            
file_content = _[-1] + else: + file_meta = file_name + + meta_inputs_array.append( + "我需要你从一段文本中识别并提取出这篇文章的1.标题、2.作者、3.作者单位、4.关键词。" + "其中,1.标题和4.关键词需要给出中文和英文的双语结果,2.作者和3.作者单位按原文语言给出。" + "以下是需要你识别的文本: " + file_meta + ) + meta_inputs_show_user_array.append( + '开始分析元数据:' + file_name + ) + meta_sys_prompt_array.append("As an academic professional, you need to extract basic informations of the paper from its metadata") + + inputs_array.append( + "我需要你根据我提供的文本总结一份Markdown文档,分为四个部分:1.研究背景,2.文章主要内容,3.主要创新点,4.结论。" + + f"各部分的题目采用二级标题前缀(## ),内容可适当的分为若干条,总字数不超过{SUMMARY_WORD_LIMIT}个中文字符." + + "以下是需要你处理的文本: " + file_content) + inputs_show_user_array.append('开始总结:' + file_name) + sys_prompt_array.append(f"As an academic professional, you need to summarize the text with less than {SUMMARY_WORD_LIMIT} Chinese characters") + + gpt_meta_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=meta_inputs_array, + inputs_show_user_array=meta_inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[""] for _ in range(len(inputs_array))], + sys_prompt_array=meta_sys_prompt_array, + # max_workers=5, # OpenAI所允许的最大并行过载 + scroller_max_len=80 + ) + + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[""] for _ in range(len(inputs_array))], + sys_prompt_array=sys_prompt_array, + # max_workers=5, # OpenAI所允许的最大并行过载 + scroller_max_len=80 + ) + try: + for idx, (gpt_say_meta, gpt_say) in enumerate(zip(gpt_meta_response_collection[1::2], gpt_response_collection[1::2])): + file_write_buffer.append(file_name_array[idx]) + file_write_buffer.append("## 元数据\n\n" + gpt_say_meta) + file_write_buffer.append(gpt_say) + except: + logging.error(trimmed_format_exc()) + + res = write_history_to_file(file_write_buffer, file_basename="result.md", auto_caption=False) + promote_file_to_downloadzone(res, chatbot=chatbot) + yield from update_ui(chatbot=chatbot, history=gpt_response_collection) # 刷新界面 + + +@CatchException +def 批量总结Markdown文档_进阶(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + import glob, os + + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "批量总结Markdown文档。函数插件贡献者: ValeriaWong,Eralien,Joshua Reed"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import fitz + except: + report_exception(chatbot, history, + a = f"解析项目: {txt}", + b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 清空历史,以免输入溢出 + history = [] + + # 检测输入参数,如没有给定输入参数,直接退出 + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 搜索需要处理的文件清单 + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.md', recursive=True)] + + # 如果没找到任何文件 + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.md文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 开始正式执行任务 + yield from 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) diff --git a/crazy_functions/批量总结PDF文档_初步.py 
b/crazy_functions/批量总结PDF文档_初步.py new file mode 100644 index 0000000..5628fa1 --- /dev/null +++ b/crazy_functions/批量总结PDF文档_初步.py @@ -0,0 +1,131 @@ +import zipfile +import os +from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder +from toolbox import CatchException, report_exception +from toolbox import write_history_to_file, promote_file_to_downloadzone +from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from .crazy_utils import read_and_clean_pdf_text +from .crazy_utils import input_clipping +pj = os.path.join + + +def move_file_to_zip(file_path, zip_file): + zip_file.write(file_path, os.path.basename(file_path)) + os.remove(file_path) + + +def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): + zip_file_path = pj(get_log_folder(), 'result.zip') + with zipfile.ZipFile(zip_file_path, 'w') as zip_file: + for file_name in file_manifest: + file_write_buffer = [] + print('begin analysis on:', file_name) + ############################## <第 0 步,切割PDF> ################################## + # 递归地切割PDF文件,每一块(尽量是完整的一个section,比如introduction,experiment等,必要时再进行切割) + # 的长度必须小于 2500 个 Token + file_content, page_one = read_and_clean_pdf_text(file_name) # (尝试)按照章节切割PDF + file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + + TOKEN_LIMIT_PER_FRAGMENT = 2500 + + from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + from request_llms.bridge_all import model_info + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( + txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) + page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( + txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4) + # 为了更好的效果,我们剥离Introduction之后的部分(如果有) + paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] + + ############################## <第 1 步,从摘要中提取高价值信息,放到history中> ################################## + final_results = [] + final_results.append("## metadata\n\n" + paper_meta + "\n\n## metadata") + + ############################## <第 2 步,迭代地历遍整个文章,提取精炼信息> ################################## + i_say_show_user = f'首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。" # 用户提示 + chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[]) # 更新UI + + iteration_results = [] + last_iteration_result = paper_meta # 初始值是摘要 + MAX_WORD_TOTAL = 4096 * 0.7 + n_fragment = len(paper_fragments) + if n_fragment >= 20: print('文章极长,不能达到预期效果') + for i in range(n_fragment): + NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment + i_say = f"Read this section, recapitulate the content of this section in Chinese with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}" + i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}" + gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, # i_say=真正给chatgpt的提问, i_say_show_user=给用户看的提问 + llm_kwargs, chatbot, + history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果 + sys_prompt="Extract the main 
idea of this section with Chinese." # 提示 + ) + iteration_results.append(gpt_say) + last_iteration_result = gpt_say + + ############################## <第 3 步,整理history,提取总结> ################################## + final_results.extend(iteration_results) + file_write_buffer.extend(final_results) + + ############################## <第 4 步,设置一个token上限> ################################## + _, final_results = input_clipping("", final_results, max_token_limit=3200) + yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了 + + res = write_history_to_file( + file_write_buffer, + file_basename=os.path.splitext(os.path.basename(file_name))[0] + '.md', + auto_caption=False + ) + if len(file_manifest) == 1: + promote_file_to_downloadzone(res, chatbot=chatbot) + return + move_file_to_zip(res, zip_file) + + promote_file_to_downloadzone(zip_file_path, chatbot=chatbot) + + +@CatchException +def 批量总结PDF文档_初步(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + import glob, os + + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "批量总结PDF文档。函数插件贡献者: ValeriaWong,Eralien,Joshua Reed"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import fitz + except: + report_exception(chatbot, history, + a = f"解析项目: {txt}", + b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 清空历史,以免输入溢出 + history = [] + + # 检测输入参数,如没有给定输入参数,直接退出 + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 搜索需要处理的文件清单 + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] + + # 如果没找到任何文件 + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex或.pdf文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 开始正式执行任务 + yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) From c60a7452bfcaa22588b9fe5bdae3b7fc94b8927b Mon Sep 17 00:00:00 2001 From: leike0813 Date: Wed, 20 Dec 2023 08:57:27 +0800 Subject: [PATCH 5/5] Improve NOUGAT pdf plugin Add an API version of NOUGAT plugin Add advanced argument support to NOUGAT plugin Adapt new text breakdown function bugfix --- config.py | 4 + crazy_functional.py | 13 ++- crazy_functions/crazy_utils.py | 101 +++++++++++++++++++++- crazy_functions/批量总结PDF文档_初步.py | 11 +-- crazy_functions/批量翻译PDF文档_NOUGAT.py | 90 ++++++++++++++++++- 5 files changed, 206 insertions(+), 13 deletions(-) diff --git a/config.py b/config.py index 17dac34..dc8ef9b 100644 --- a/config.py +++ b/config.py @@ -217,6 +217,10 @@ GROBID_URLS = [ ] +# NOUGAT_API主机地址 +NOUGAT_URLS = ["http://localhost:8503"] # 此处填写NOUGAT_API的主机地址 + + # 是否允许通过自然语言描述修改本页的配置,该功能具有一定的危险性,默认关闭 ALLOW_RESET_CONFIG = False diff --git a/crazy_functional.py b/crazy_functional.py index 3275f79..d6a9dd9 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -549,13 +549,24 @@ def get_crazy_functions(): print('Load function plugin failed') try: - from crazy_functions.批量翻译PDF文档_NOUGAT import 批量翻译PDF文档 + from crazy_functions.批量翻译PDF文档_NOUGAT import 批量翻译PDF文档, 批量翻译PDF文档_API function_plugins.update({ "精准翻译PDF文档(NOUGAT)": { "Group": "学术", "Color": "stop", "AsButton": False, + "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) + 
"ArgsReminder": "在这里输入自定义参数, 支持的参数有: --batchsize BATCHSIZE, --model MODEL_TAG, --recompute, --full-precision, --no-markdown --no-skipping, --pages PAGES/-p PAGES", # 高级参数输入区的显示提示 "Function": HotReload(批量翻译PDF文档) + }, + "精准翻译PDF文档(NOUGAT_API)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) + "ArgsReminder": "在这里输入自定义参数, 支持的参数有: --batchsize BATCHSIZE, --recompute, --no-markdown --no-skipping, --pages PAGES/-p PAGES (官方版本的API仅支持--pages参数)", + # 高级参数输入区的显示提示 + "Function": HotReload(批量翻译PDF文档_API) } }) except: diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index 731da1a..a583e25 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -545,7 +545,20 @@ def get_files_from_everything(txt, type): # type='.md' @Singleton class nougat_interface(): def __init__(self): + def model_check(model_tag): + if model_tag in ['0.1.0-small', '0.1.0-base']: return model_tag + return '0.1.0-small' + + import argparse self.threadLock = threading.Lock() + self.arg_parser = argparse.ArgumentParser() + self.arg_parser.add_argument('--batchsize', type=int) + self.arg_parser.add_argument('--model', type=model_check) + self.arg_parser.add_argument('--recompute', action='store_true') + self.arg_parser.add_argument('--full-precision', action='store_true') + self.arg_parser.add_argument('--no-markdown', action='store_true') + self.arg_parser.add_argument('--no-skipping', action='store_true') + self.arg_parser.add_argument('--pages', type=str) def nougat_with_timeout(self, command, cwd, timeout=3600): import subprocess @@ -563,7 +576,7 @@ class nougat_interface(): return True - def NOUGAT_parse_pdf(self, fp, chatbot, history): + def NOUGAT_parse_pdf(self, fp, chatbot, history, advanced_cfg=''): from toolbox import update_ui_lastest_msg yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...", @@ -576,7 +589,10 @@ class nougat_interface(): yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... 
@@ -563,7 +576,7 @@ class nougat_interface():
         return True
 
 
-    def NOUGAT_parse_pdf(self, fp, chatbot, history):
+    def NOUGAT_parse_pdf(self, fp, chatbot, history, advanced_cfg=''):
         from toolbox import update_ui_lastest_msg
 
         yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...",
                                          chatbot=chatbot, history=history, delay=0)
@@ -576,7 +589,10 @@ class nougat_interface():
 
         yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... (提示:首次运行需要花费较长时间下载NOUGAT参数)",
                                          chatbot=chatbot, history=history, delay=0)
-        self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
+        self.nougat_with_timeout(
+            f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}" {self.parse_argument(advanced_cfg)}',
+            os.getcwd(), timeout=3600
+        )
         res = glob.glob(os.path.join(dst,'*.mmd'))
         if len(res) == 0:
             self.threadLock.release()
@@ -585,6 +601,87 @@ class nougat_interface():
         return res[0]
 
+    def NOUGAT_API_parse_pdf(self, fp, chatbot, history, nougat_url, advanced_cfg=''):
+        from toolbox import update_ui_lastest_msg
+
+        yield from update_ui_lastest_msg("正在解析论文, 请稍候。",
+                                         chatbot=chatbot, history=history, delay=0)
+
+        import requests
+        from toolbox import get_log_folder, gen_time_str
+        dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str())
+        os.makedirs(dst)
+
+        with open(fp, "rb") as pdf_file:  # 用with确保上传句柄被关闭
+            ret = requests.post(
+                f'{nougat_url}/predict{self.parse_api_argument(advanced_cfg)}',
+                files={"file": pdf_file}
+            )
+        if ret.status_code != 200:
+            raise RuntimeError("Nougat解析论文失败。")
+
+        # 以输入PDF的文件名生成输出路径(不能把'*'字面量写进文件名)
+        output_path = os.path.join(dst, os.path.splitext(os.path.basename(fp))[0] + '.mmd')
+        with open(output_path, 'w', encoding='utf8') as f:
+            f.write(ret.json())
+        return output_path
+
+
+    def parse_argument(self, argument_string):
+        args, _ = self.arg_parser.parse_known_args(argument_string.split())
+        reduce_args = []
+        for k, v in args.__dict__.items():
+            if (v is not None) and (v is not False):
+                reduce_args.append('--' + k.replace('_', '-'))
+                if not isinstance(v, bool) and v is not None:
+                    reduce_args.append(str(v))
+
+        return ' '.join(reduce_args)
+
+
+    def parse_api_argument(self, argument_string):
+        def parse_pages(pages_string):
+            if pages_string.count(',') > 0:
+                pages_list = pages_string.split(',')
+                page_start = pages_list[0].split('-')[0] if '-' in pages_list[0] else pages_list[0]
+                page_end = pages_list[-1].split('-')[-1] if '-' in pages_list[-1] else pages_list[-1]
+            else:
+                if '-' in pages_string:
+                    page_start = pages_string.split('-')[0]
+                    page_end = pages_string.split('-')[-1]
+                else:
+                    page_start = page_end = int(pages_string)
+
+            return page_start, page_end
+
+        args, _ = self.arg_parser.parse_known_args(argument_string.split())
+        reduce_args = []
+        for k, v in args.__dict__.items():
+            arg_pair = ''
+            if (v is not None) and (v is not False):
+                if k == 'pages':
+                    page_start, page_end = parse_pages(v)
+                    arg_pair = f'start={page_start}&stop={page_end}'
+                elif k not in ['model', 'full_precision']:  # 这两个参数不作为查询参数传递
+                    arg_pair = f'{k}={int(v)}'
+            if arg_pair:
+                reduce_args.append(arg_pair)
+
+        return '?' + '&'.join(reduce_args) if reduce_args else ''  # 无参数时不要附加孤立的'?'
+
+    @staticmethod
+    def get_avail_nougat_url():
+        import random
+        import requests
+        NOUGAT_URLS = get_conf('NOUGAT_URLS')
+        if len(NOUGAT_URLS) == 0: return None
+        try:
+            _nougat_url = random.choice(NOUGAT_URLS)  # 随机负载均衡
+            _nougat_url = _nougat_url.rstrip('/')
+            ret = requests.get(_nougat_url + '/')
+            if ret.status_code == 200:
+                return _nougat_url
+            else:
+                return None
+        except:
+            return None
 
 
 def try_install_deps(deps, reload_m=[]):
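
For reference, a hand-traced example of what the two translators above emit for the same user string (ni stands for the nougat_interface singleton; the results follow from the parser registrations and the add_argument ordering, traced by hand rather than generated):

    # ni = nougat_interface()
    ni.parse_argument('--recompute --pages 2-5')
    #   -> '--recompute --pages 2-5'       (appended verbatim to the nougat CLI call)
    ni.parse_api_argument('--recompute --pages 2-5')
    #   -> '?recompute=1&start=2&stop=5'   (rewritten as /predict query parameters)
    ni.parse_api_argument('')
    #   -> ''                              (no recognized options, no query string)

Note that, per the ArgsReminder above, the official NOUGAT_API only honors the page range; the extra query parameters target the extended fork at https://github.com/leike0813/nougat.
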
+ '&'.join(reduce_args) + + @staticmethod + def get_avail_nougat_url(): + import random + import requests + NOUGAT_URLS = get_conf('NOUGAT_URLS') + if len(NOUGAT_URLS) == 0: return None + try: + _nougat_url = random.choice(NOUGAT_URLS) # 随机负载均衡 + if _nougat_url.endswith('/'): _nougat_url = _nougat_url.rstrip('/') + ret = requests.get(_nougat_url + '/') + if ret.status_code == 200: + return _nougat_url + else: + return None + except: + return None def try_install_deps(deps, reload_m=[]): diff --git a/crazy_functions/批量总结PDF文档_初步.py b/crazy_functions/批量总结PDF文档_初步.py index 5628fa1..52fd47b 100644 --- a/crazy_functions/批量总结PDF文档_初步.py +++ b/crazy_functions/批量总结PDF文档_初步.py @@ -29,14 +29,9 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, TOKEN_LIMIT_PER_FRAGMENT = 2500 - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT) - page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf( - txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4) + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) + page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT // 4, llm_model=llm_kwargs['llm_model']) # 为了更好的效果,我们剥离Introduction之后的部分(如果有) paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] diff --git a/crazy_functions/批量翻译PDF文档_NOUGAT.py b/crazy_functions/批量翻译PDF文档_NOUGAT.py index 97170d0..3b841a1 100644 --- a/crazy_functions/批量翻译PDF文档_NOUGAT.py +++ b/crazy_functions/批量翻译PDF文档_NOUGAT.py @@ -54,7 +54,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst # 基本信息:功能、贡献者 chatbot.append([ "函数插件功能?", - "批量翻译PDF文档。函数插件贡献者: Binary-Husky"]) + "批量翻译PDF文档。函数插件贡献者: Binary-Husky,Joshua Reed"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 清空历史,以免输入溢出 @@ -104,11 +104,13 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa DST_LANG = "中文" from crazy_functions.crazy_utils import nougat_interface from crazy_functions.pdf_fns.report_gen_html import construct_html + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + advanced_cfg = plugin_kwargs.get("advanced_arg", '') nougat_handle = nougat_interface() for index, fp in enumerate(file_manifest): if fp.endswith('pdf'): chatbot.append(["当前进度:", f"正在解析论文,请稍候。(第一次运行时,需要花费较长时间下载NOUGAT参数)"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - fpp = yield from nougat_handle.NOUGAT_parse_pdf(fp, chatbot, history) + fpp = yield from nougat_handle.NOUGAT_parse_pdf(fp, chatbot, history, advanced_cfg=advanced_cfg) promote_file_to_downloadzone(fpp, rename_file=os.path.basename(fpp)+'.nougat.mmd', chatbot=chatbot) else: chatbot.append(["当前论文无需解析:", fp]); yield from update_ui( chatbot=chatbot, history=history) @@ -123,3 +125,87 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 
@@ -123,3 +125,87 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa
 
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+
+@CatchException
+def 批量翻译PDF文档_API(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+
+    disable_auto_promotion(chatbot)
+    # 基本信息:功能、贡献者
+    chatbot.append([
+        "函数插件功能?",
+        "使用NOUGAT_API批量翻译PDF文档。函数插件贡献者: Binary-Husky,Joshua Reed。\n" +
+        "官方版本API仅支持页码范围选择,若要支持更多参数,请移步https://github.com/leike0813/nougat",
+    ])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+    # 清空历史,以免输入溢出
+    history = []
+
+    from .crazy_utils import get_files_from_everything
+    success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
+    if len(file_manifest) > 0:
+        # 尝试导入依赖,如果缺少依赖,则给出安装建议
+        try:
+            import tiktoken
+        except:
+            report_exception(chatbot, history,
+                             a=f"解析项目: {txt}",
+                             b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
+            yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+            return
+    success_mmd, file_manifest_mmd, _ = get_files_from_everything(txt, type='.mmd')
+    success = success or success_mmd
+    file_manifest += file_manifest_mmd
+    chatbot.append(["文件列表:", ", ".join([e.split('/')[-1] for e in file_manifest])])
+    yield from update_ui(chatbot=chatbot, history=history)
+    # 检测输入参数,如没有给定输入参数,直接退出
+    if not success:
+        if txt == "": txt = '空空如也的输入栏'
+
+    # 如果没找到任何文件
+    if len(file_manifest) == 0:
+        report_exception(chatbot, history,
+                         a=f"解析项目: {txt}", b=f"找不到任何.pdf拓展名的文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # 开始正式执行任务
+    yield from 解析PDF_基于NOUGAT_API(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
+
+
+def 解析PDF_基于NOUGAT_API(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
+    import copy
+    import tiktoken
+    TOKEN_LIMIT_PER_FRAGMENT = 1024
+    generated_conclusion_files = []
+    generated_html_files = []
+    DST_LANG = "中文"
+    from crazy_functions.crazy_utils import nougat_interface
+    from crazy_functions.pdf_fns.report_gen_html import construct_html
+    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
+    advanced_cfg = plugin_kwargs.get("advanced_arg", '')
+    nougat_handle = nougat_interface()
+    chatbot.append(["当前进度:", "正在检查NOUGAT服务可用性..."])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+    nougat_url = nougat_handle.get_avail_nougat_url()
+    if nougat_url is None:
+        report_exception(chatbot, history,
+                         a="检查结果:", b="NOUGAT服务不可用,请检查config中的NOUGAT_URLS")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    for index, fp in enumerate(file_manifest):
+        if fp.endswith('pdf'):
+            chatbot.append(["当前进度:", "正在解析论文,请稍候。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+            fpp = yield from nougat_handle.NOUGAT_API_parse_pdf(fp, chatbot, history, nougat_url, advanced_cfg=advanced_cfg)
+            promote_file_to_downloadzone(fpp, rename_file=os.path.basename(fpp)+'.nougat.mmd', chatbot=chatbot)
+        else:
+            chatbot.append(["当前论文无需解析:", fp]); yield from update_ui(chatbot=chatbot, history=history)
+            fpp = fp
+        with open(fpp, 'r', encoding='utf8') as f:
+            article_content = f.readlines()
+        article_dict = markdown_to_dict(article_content)
+        logging.info(article_dict)
+        yield from translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG)
+
+    chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files)))
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
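
For completeness, the HTTP round trip that NOUGAT_API_parse_pdf performs boils down to the following standalone sketch. The host is the default NOUGAT_URLS entry, the input file name is hypothetical, and the JSON-string response format is what the patch assumes when it writes ret.json() to the .mmd file:

    import requests

    nougat_url = "http://localhost:8503"  # default NOUGAT_URLS entry from config.py
    query = "?start=1&stop=5"             # what parse_api_argument('--pages 1-5') produces

    with open("paper.pdf", "rb") as f:    # hypothetical input PDF
        ret = requests.post(f"{nougat_url}/predict{query}", files={"file": f})
    ret.raise_for_status()
    mmd_text = ret.json()                 # the server returns the .mmd source as a JSON string
    print(mmd_text[:200])
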