From ed11269aefbc20be97a2b6b6a2a6bf0728467a50 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 28 Aug 2023 01:22:20 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=80=9F=E5=8A=A9GROBID?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0PDF=E9=AB=98=E7=B2=BE=E5=BA=A6=E7=BF=BB?= =?UTF-8?q?=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 28 +++- crazy_functions/pdf_fns/parse_pdf.py | 25 ++++ crazy_functions/批量翻译PDF文档_多线程.py | 150 ++++++++++++++++++---- requirements.txt | 3 +- tests/test_plugins.py | 6 +- 5 files changed, 177 insertions(+), 35 deletions(-) create mode 100644 crazy_functions/pdf_fns/parse_pdf.py diff --git a/config.py b/config.py index f82891c..876a164 100644 --- a/config.py +++ b/config.py @@ -70,8 +70,10 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"] -# P.S. 其他可用的模型还包括 ["qianfan", "llama2", "qwen", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "spark", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", + "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"] +# P.S. 其他可用的模型还包括 ["qianfan", "llama2", "qwen", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", +# "spark", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_pangualpha", "jittorllms_llama"] # 百度千帆(LLM_MODEL="qianfan") @@ -162,6 +164,14 @@ CUSTOM_API_KEY_PATTERN = "" HUGGINGFACE_ACCESS_TOKEN = "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" +# GROBID服务器地址(填写多个可以均衡负载),用于高质量地读取PDF文档 +# 获取方法:复制以下空间https://huggingface.co/spaces/qingxu98/grobid,设为public,然后GROBID_URL = "https://(你的hf用户名如qingxu98)-(你的填写的空间名如grobid).hf.space" +GROBID_URLS = [ + "https://qingxu98-grobid.hf.space","https://qingxu98-grobid2.hf.space","https://qingxu98-grobid3.hf.space", + "https://shaocongma-grobid.hf.space","https://FBR123-grobid.hf.space", +] + + """ 在线大模型配置关联关系示意图 @@ -205,9 +215,13 @@ HUGGINGFACE_ACCESS_TOKEN = "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" 插件在线服务配置依赖关系示意图 │ ├── 语音功能 - ├── ENABLE_AUDIO - ├── ALIYUN_TOKEN - ├── ALIYUN_APPKEY - ├── ALIYUN_ACCESSKEY - └── ALIYUN_SECRET +│ ├── ENABLE_AUDIO +│ ├── ALIYUN_TOKEN +│ ├── ALIYUN_APPKEY +│ ├── ALIYUN_ACCESSKEY +│ └── ALIYUN_SECRET +│ +├── PDF文档精准解析 +│ └── GROBID_URLS + """ \ No newline at end of file diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py new file mode 100644 index 0000000..00016be --- /dev/null +++ b/crazy_functions/pdf_fns/parse_pdf.py @@ -0,0 +1,25 @@ +import requests +import random +from functools import lru_cache +class GROBID_OFFLINE_EXCEPTION(Exception): pass + +def get_avail_grobid_url(): + from toolbox import get_conf + GROBID_URLS, = get_conf('GROBID_URLS') + if len(GROBID_URLS) == 0: return None + try: + _grobid_url = random.choice(GROBID_URLS) # 随机负载均衡 + if _grobid_url.endswith('/'): _grobid_url = _grobid_url.rstrip('/') + res = requests.get(_grobid_url+'/api/isalive') + if res.text=='true': return _grobid_url + else: return None + except: + return None + +@lru_cache(maxsize=32) +def parse_pdf(pdf_path, grobid_url): + import scipdf # pip install scipdf_parser + if grobid_url.endswith('/'): grobid_url = grobid_url.rstrip('/') + article_dict = scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url) + return article_dict + diff --git a/crazy_functions/批量翻译PDF文档_多线程.py b/crazy_functions/批量翻译PDF文档_多线程.py index 0adac96..e0558e9 100644 --- a/crazy_functions/批量翻译PDF文档_多线程.py +++ b/crazy_functions/批量翻译PDF文档_多线程.py @@ -1,15 +1,19 @@ from toolbox import CatchException, report_execption, write_results_to_file -from toolbox import update_ui, promote_file_to_downloadzone +from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion +from toolbox import write_history_to_file, get_log_folder from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency from .crazy_utils import read_and_clean_pdf_text +from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url from colorful import * +import glob +import os +import math @CatchException -def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port): - import glob - import os +def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + disable_auto_promotion(chatbot) # 基本信息:功能、贡献者 chatbot.append([ "函数插件功能?", @@ -30,20 +34,11 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_ # 清空历史,以免输入溢出 history = [] + from .crazy_utils import get_files_from_everything + success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf') # 检测输入参数,如没有给定输入参数,直接退出 - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": - txt = '空空如也的输入栏' - report_execption(chatbot, history, - a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # 搜索需要处理的文件清单 - file_manifest = [f for f in glob.glob( - f'{project_folder}/**/*.pdf', recursive=True)] + if not success: + if txt == "": txt = '空空如也的输入栏' # 如果没找到任何文件 if len(file_manifest) == 0: @@ -53,22 +48,130 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_ return # 开始正式执行任务 - yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt) + grobid_url = get_avail_grobid_url() + if grobid_url is not None: + yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url) + else: + yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3) + yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) -def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt): - import os +def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url): import copy import tiktoken TOKEN_LIMIT_PER_FRAGMENT = 1280 generated_conclusion_files = [] generated_html_files = [] + DST_LANG = "中文" for index, fp in enumerate(file_manifest): + chatbot.append(["当前进度:", f"正在连接GROBID服务,请稍候: {grobid_url}\n如果等待时间过长,请修改config中的GROBID_URL,可修改成本地GROBID服务。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + article_dict = parse_pdf(fp, grobid_url) + print(article_dict) + prompt = "以下是一篇学术论文的基本信息:\n" + # title + title = article_dict.get('title', '无法获取 title'); prompt += f'title:{title}\n\n' + # authors + authors = article_dict.get('authors', '无法获取 authors'); prompt += f'authors:{authors}\n\n' + # abstract + abstract = article_dict.get('abstract', '无法获取 abstract'); prompt += f'abstract:{abstract}\n\n' + # command + prompt += f"请将题目和摘要翻译为{DST_LANG}。" + meta = [f'# Title:\n\n', title, f'# Abstract:\n\n', abstract ] + # 单线,获取文章meta信息 + paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=prompt, + inputs_show_user=prompt, + llm_kwargs=llm_kwargs, + chatbot=chatbot, history=[], + sys_prompt="You are an academic paper reader。", + ) + + # 多线,翻译 + inputs_array = [] + inputs_show_user_array = [] + + # get_token_num + from request_llm.bridge_all import model_info + enc = model_info[llm_kwargs['llm_model']]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + + def break_down(txt): + raw_token_num = get_token_num(txt) + if raw_token_num <= TOKEN_LIMIT_PER_FRAGMENT: + return [txt] + else: + # raw_token_num > TOKEN_LIMIT_PER_FRAGMENT + # find a smooth token limit to achieve even seperation + count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT)) + token_limit_smooth = raw_token_num // count + count + return breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn=get_token_num, limit=token_limit_smooth) + + for section in article_dict.get('sections'): + if len(section['text']) == 0: continue + section_frags = break_down(section['text']) + for i, fragment in enumerate(section_frags): + heading = section['heading'] + if len(section_frags) > 1: heading += f'Part-{i+1}' + inputs_array.append( + f"你需要翻译{heading}章节,内容如下: \n\n{fragment}" + ) + inputs_show_user_array.append( + f"# {heading}\n\n{fragment}" + ) + + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[meta for _ in inputs_array], + sys_prompt_array=[ + "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in inputs_array], + ) + res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + gpt_response_collection, file_basename=None, file_fullname=None) + promote_file_to_downloadzone(res_path, rename_file=os.path.basename(fp)+'.md', chatbot=chatbot) + generated_conclusion_files.append(res_path) + + ch = construct_html() + orig = "" + trans = "" + gpt_response_collection_html = copy.deepcopy(gpt_response_collection) + for i,k in enumerate(gpt_response_collection_html): + if i%2==0: + gpt_response_collection_html[i] = inputs_show_user_array[i//2] + else: + gpt_response_collection_html[i] = gpt_response_collection_html[i] + + final = ["", "", "一、论文概况", "", "Abstract", paper_meta_info, "二、论文翻译", ""] + final.extend(gpt_response_collection_html) + for i, k in enumerate(final): + if i%2==0: + orig = k + if i%2==1: + trans = k + ch.add_row(a=orig, b=trans) + create_report_file_name = f"{os.path.basename(fp)}.trans.html" + html_file = ch.save_file(create_report_file_name) + generated_html_files.append(html_file) + promote_file_to_downloadzone(html_file, rename_file=os.path.basename(html_file), chatbot=chatbot) + + chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files))) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + +def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): + import copy + TOKEN_LIMIT_PER_FRAGMENT = 1280 + generated_conclusion_files = [] + generated_html_files = [] + for index, fp in enumerate(file_manifest): # 读取PDF文件 file_content, page_one = read_and_clean_pdf_text(fp) file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + # 递归地切割PDF文件 from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf from request_llm.bridge_all import model_info @@ -140,8 +243,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, trans = k ch.add_row(a=orig, b=trans) create_report_file_name = f"{os.path.basename(fp)}.trans.html" - ch.save_file(create_report_file_name) - generated_html_files.append(f'./gpt_log/{create_report_file_name}') + generated_html_files.append(ch.save_file(create_report_file_name)) except: from toolbox import trimmed_format_exc print('writing html result failed:', trimmed_format_exc()) @@ -202,6 +304,6 @@ class construct_html(): def save_file(self, file_name): - with open(f'./gpt_log/{file_name}', 'w', encoding='utf8') as f: + with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f: f.write(self.html_string.encode('utf-8', 'ignore').decode()) - + return os.path.join(get_log_folder(), file_name) diff --git a/requirements.txt b/requirements.txt index 92dc477..e6d27d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,5 +18,6 @@ openai numpy arxiv rich -websocket-client pypdf2==2.12.1 +websocket-client +scipdf_parser==0.3 diff --git a/tests/test_plugins.py b/tests/test_plugins.py index 53969bf..4913a59 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -9,9 +9,9 @@ validate_path() # 返回项目根路径 from tests.test_utils import plugin_test if __name__ == "__main__": - plugin_test(plugin='crazy_functions.命令行助手->命令行助手', main_input='查看当前的docker容器列表') + # plugin_test(plugin='crazy_functions.命令行助手->命令行助手', main_input='查看当前的docker容器列表') - plugin_test(plugin='crazy_functions.解析项目源代码->解析一个Python项目', main_input="crazy_functions/test_project/python/dqn") + # plugin_test(plugin='crazy_functions.解析项目源代码->解析一个Python项目', main_input="crazy_functions/test_project/python/dqn") # plugin_test(plugin='crazy_functions.解析项目源代码->解析一个C项目', main_input="crazy_functions/test_project/cpp/cppipc") @@ -19,7 +19,7 @@ if __name__ == "__main__": # plugin_test(plugin='crazy_functions.批量Markdown翻译->Markdown中译英', main_input="README.md") - # plugin_test(plugin='crazy_functions.批量翻译PDF文档_多线程->批量翻译PDF文档', main_input="crazy_functions/test_project/pdf_and_word") + plugin_test(plugin='crazy_functions.批量翻译PDF文档_多线程->批量翻译PDF文档', main_input='crazy_functions/test_project/pdf_and_word/aaai.pdf') # plugin_test(plugin='crazy_functions.谷歌检索小助手->谷歌检索小助手', main_input="https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=auto+reinforcement+learning&btnG=")