diff --git a/README.md b/README.md
index e0d6fd8..4d2f79d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-# ChatGPT 学术优化
+
+
+# ChatGPT 学术优化
 
 **如果喜欢这个项目,请给它一个Star;如果你发明了更好用的快捷键或函数插件,欢迎发issue或者pull requests**
@@ -68,11 +70,13 @@ huggingface免科学上网[在线体验](https://huggingface.co/spaces/qingxu98/
 
-- 多种大语言模型混合调用([v3.1分支](https://github.com/binary-husky/chatgpt_academic/tree/v3.1)测试中)
+- 多种大语言模型混合调用(ChatGLM + OpenAI-GPT3.5 + [API2D](https://api2d.com/)-GPT4, [v3.1分支](https://github.com/binary-husky/chatgpt_academic/tree/v3.1)测试中)
 
 <div align="center">
-
+
 </div>
+v3.1的[huggingface测试版](https://huggingface.co/spaces/qingxu98/academic-chatgpt-beta)(huggingface版不支持chatglm)
+
 
 ## 直接运行 (Windows, Linux or MacOS)
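Note on the mixed-model feature advertised in this README hunk: the routing that lets ChatGLM, OpenAI-GPT3.5, and API2D-GPT4 be called side by side is the `model_info` registry touched in the `request_llm/bridge_all.py` hunk later in this patch, where each model name maps to an endpoint, a token limit, and a token counter. A minimal sketch of that dispatch pattern, with hypothetical `predict_openai`/`predict_chatglm` placeholders standing in for the real bridge functions (only the endpoints and `max_token` values are taken from the patch):

```python
def predict_openai(prompt, endpoint):
    """Placeholder for the OpenAI/API2D bridge (hypothetical)."""
    raise NotImplementedError

def predict_chatglm(prompt, endpoint):
    """Placeholder for the local ChatGLM bridge (hypothetical)."""
    raise NotImplementedError

# Endpoints and token limits copied from the bridge_all.py hunk in this patch.
MODEL_INFO = {
    "gpt-3.5-turbo": {"fn": predict_openai, "endpoint": "https://api.openai.com/v1/chat/completions", "max_token": 4096},
    "api2d-gpt-4":   {"fn": predict_openai, "endpoint": "https://openai.api2d.net/v1/chat/completions", "max_token": 8192},
    "chatglm":       {"fn": predict_chatglm, "endpoint": None, "max_token": 1024},
}

def dispatch(model, prompt):
    info = MODEL_INFO[model]  # unknown model names fail fast with KeyError
    return info["fn"](prompt, info["endpoint"])
```

API2D reuses the OpenAI wire format, which is why the sketch (like the patch) can point two registry entries at the same bridge function and vary only the endpoint.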
diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py
index 29c58b4..6085049 100644
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -448,6 +448,7 @@ def read_and_clean_pdf_text(fp):
             pf = 998
             for l in t['lines']:
                 txt_line = "".join([wtf['text'] for wtf in l['spans']])
+                if len(txt_line) == 0: continue
                 pf = primary_ffsize(l)
                 meta_line.append([txt_line, pf, l['bbox'], l])
                 for wtf in l['spans']: # for l in t['lines']:
@@ -558,8 +559,8 @@ def read_and_clean_pdf_text(fp):
     meta_txt = meta_txt.replace('\n', '\n\n')
 
     ############################## <第 5 步,展示分割效果> ##################################
-    for f in finals:
-        print亮黄(f)
-        print亮绿('***************************')
+    # for f in finals:
+    #     print亮黄(f)
+    #     print亮绿('***************************')
 
     return meta_txt, page_one_meta
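The `if len(txt_line) == 0: continue` guard added above keeps empty lines out of `meta_line`, so the downstream font-size statistics (`primary_ffsize`) are never computed over a line whose spans carry no text. A minimal sketch of the same idea against the PyMuPDF (`fitz`) API that `read_and_clean_pdf_text` builds on; `iter_nonempty_lines` is illustrative rather than project code, and the longest-span heuristic for the primary font size is an assumption:

```python
import fitz  # PyMuPDF

def iter_nonempty_lines(pdf_path):
    """Yield (text, primary_font_size, bbox) for each non-empty text line."""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", []):  # image blocks carry no "lines"
                    text = "".join(span["text"] for span in line["spans"])
                    if len(text) == 0:
                        continue  # same guard as the patch: empty lines distort the statistics
                    # assumption: the longest span's size is the line's primary font size
                    longest = max(line["spans"], key=lambda s: len(s["text"]))
                    yield text, longest["size"], line["bbox"]
```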
diff --git a/crazy_functions/批量翻译PDF文档_多线程.py b/crazy_functions/批量翻译PDF文档_多线程.py
index 0d2f1d5..351be0e 100644
--- a/crazy_functions/批量翻译PDF文档_多线程.py
+++ b/crazy_functions/批量翻译PDF文档_多线程.py
@@ -13,7 +13,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
     # 基本信息:功能、贡献者
     chatbot.append([
         "函数插件功能?",
-        "批量总结PDF文档。函数插件贡献者: Binary-Husky"])
+        "批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
 
     # 尝试导入依赖,如果缺少依赖,则给出安装建议
@@ -59,7 +59,7 @@
 def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt):
     import os
     import tiktoken
-    TOKEN_LIMIT_PER_FRAGMENT = 1600
+    TOKEN_LIMIT_PER_FRAGMENT = 1280
     generated_conclusion_files = []
     for index, fp in enumerate(file_manifest):
@@ -91,13 +91,13 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
         # 多线,翻译
         gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
             inputs_array=[
-                f"以下是你需要翻译的论文片段:\n{frag}" for frag in paper_fragments],
+                f"你需要翻译以下内容:\n{frag}" for frag in paper_fragments],
             inputs_show_user_array=[f"\n---\n 原文: \n\n {frag.replace('#', '')} \n---\n 翻译:\n " for frag in paper_fragments],
             llm_kwargs=llm_kwargs,
             chatbot=chatbot,
             history_array=[[paper_meta] for _ in paper_fragments],
             sys_prompt_array=[
-                "请你作为一个学术翻译,负责把学术论文的片段准确翻译成中文。" for _ in paper_fragments],
+                "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in paper_fragments],
             # max_workers=5 # OpenAI所允许的最大并行过载
         )
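Lowering `TOKEN_LIMIT_PER_FRAGMENT` from 1600 to 1280 leaves more of the model's context window (4096 tokens for gpt-3.5-turbo, per the `bridge_all.py` hunk below) for the prompt scaffolding and the translated output of each fragment. A minimal sketch of token-bounded, paragraph-aligned splitting with `tiktoken`; the project has its own splitting helper, so `split_by_token_limit` here is an illustration only:

```python
import tiktoken

def split_by_token_limit(text, limit=1280, model="gpt-3.5-turbo"):
    """Greedily pack paragraphs into fragments of at most `limit` tokens."""
    enc = tiktoken.encoding_for_model(model)
    count = lambda s: len(enc.encode(s, disallowed_special=()))
    fragments, current = [], ""
    for para in text.split("\n"):
        candidate = current + para + "\n"
        if count(candidate) > limit and current:
            fragments.append(current)  # flush and start a new fragment
            current = para + "\n"
        else:
            current = candidate        # a single oversized paragraph stays whole
    if current:
        fragments.append(current)
    return fragments
```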
diff --git a/docs/Dockerfile+ChatGLM b/docs/Dockerfile+ChatGLM
index 197ca1a..4a11f06 100644
--- a/docs/Dockerfile+ChatGLM
+++ b/docs/Dockerfile+ChatGLM
@@ -36,6 +36,9 @@ from transformers import AutoModel, AutoTokenizer \n\
 chatglm_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) \n\
 chatglm_model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).float() ' >> warm_up_chatglm.py
 RUN python3 -u warm_up_chatglm.py
+
+# 禁用缓存,确保更新代码
+ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
 RUN $useProxyNetwork git pull
 
 # 为chatgpt-academic配置代理和API-KEY (非必要 可选步骤)
diff --git a/docs/logo.png b/docs/logo.png
new file mode 100644
index 0000000..567dee1
Binary files /dev/null and b/docs/logo.png differ
diff --git a/main.py b/main.py
index 5baf655..723d5d2 100644
--- a/main.py
+++ b/main.py
@@ -182,7 +182,7 @@ def main():
         threading.Thread(target=auto_update, name="self-upgrade", daemon=True).start()
 
     auto_opentab_delay()
-    demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", server_port=PORT, auth=AUTHENTICATION)
+    demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", server_port=PORT, auth=AUTHENTICATION, favicon_path="docs/logo.png")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/request_llm/bridge_all.py b/request_llm/bridge_all.py
index b5ad5c1..ecc6b1e 100644
--- a/request_llm/bridge_all.py
+++ b/request_llm/bridge_all.py
@@ -23,6 +23,9 @@ from .bridge_tgui import predict as tgui_ui
 
 colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044']
 
+get_token_num_gpt35 = lambda txt: len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(txt, disallowed_special=()))
+get_token_num_gpt4 = lambda txt: len(tiktoken.encoding_for_model("gpt-4").encode(txt, disallowed_special=()))
+
 model_info = {
     # openai
     "gpt-3.5-turbo": {
@@ -31,7 +34,7 @@ model_info = {
         "endpoint": "https://api.openai.com/v1/chat/completions",
         "max_token": 4096,
         "tokenizer": tiktoken.encoding_for_model("gpt-3.5-turbo"),
-        "token_cnt": lambda txt: len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(txt, disallowed_special=())),
+        "token_cnt": get_token_num_gpt35,
     },
 
     "gpt-4": {
@@ -40,7 +43,7 @@ model_info = {
         "endpoint": "https://api.openai.com/v1/chat/completions",
         "max_token": 8192,
         "tokenizer": tiktoken.encoding_for_model("gpt-4"),
-        "token_cnt": lambda txt: len(tiktoken.encoding_for_model("gpt-4").encode(txt, disallowed_special=())),
+        "token_cnt": get_token_num_gpt4,
     },
 
     # api_2d
     "api2d-gpt-3.5-turbo": {
@@ -50,7 +53,7 @@ model_info = {
         "endpoint": "https://openai.api2d.net/v1/chat/completions",
         "max_token": 4096,
         "tokenizer": tiktoken.encoding_for_model("gpt-3.5-turbo"),
-        "token_cnt": lambda txt: len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(txt, disallowed_special=())),
+        "token_cnt": get_token_num_gpt35,
     },
 
     "api2d-gpt-4": {
@@ -59,7 +62,7 @@ model_info = {
         "endpoint": "https://openai.api2d.net/v1/chat/completions",
         "max_token": 8192,
         "tokenizer": tiktoken.encoding_for_model("gpt-4"),
-        "token_cnt": lambda txt: len(tiktoken.encoding_for_model("gpt-4").encode(txt, disallowed_special=())),
+        "token_cnt": get_token_num_gpt4,
     },
 
     # chatglm
     "chatglm": {
@@ -69,7 +72,7 @@ model_info = {
         "endpoint": None,
         "max_token": 1024,
         "tokenizer": tiktoken.encoding_for_model("gpt-3.5-turbo"),
-        "token_cnt": lambda txt: len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(txt, disallowed_special=())),
+        "token_cnt": get_token_num_gpt35,
     },
 }
diff --git a/toolbox.py b/toolbox.py
index dfd3d1d..05fd368 100644
--- a/toolbox.py
+++ b/toolbox.py
@@ -401,7 +401,7 @@ def on_file_uploaded(files, chatbot, txt, txt2, checkboxes):
         chatbot.append(['我上传了文件,请查收',
                         f'[Local Message] 收到以下文件: \n\n{moved_files_str}' +
                         f'\n\n调用路径参数已自动修正到: \n\n{txt}' +
-                        f'\n\n现在您点击任意实验功能时,以上文件将被作为输入参数'+err_msg])
+                        f'\n\n现在您点击任意“红颜色”标识的函数插件时,以上文件将被作为输入参数'+err_msg])
     return chatbot, txt, txt2
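The `bridge_all.py` change above defines each token counter once (`get_token_num_gpt35`, `get_token_num_gpt4`) and shares it across the `model_info` entries, instead of repeating an identical lambda per model. A small sketch extending the same idea by memoizing the encoder lookup; `token_count` and `_encoder` are illustrative names, not part of the patch (recent tiktoken releases also cache encodings internally):

```python
from functools import lru_cache
import tiktoken

@lru_cache(maxsize=None)
def _encoder(model):
    # one Encoding object per model name, built on first use and reused afterwards
    return tiktoken.encoding_for_model(model)

def token_count(txt, model="gpt-3.5-turbo"):
    # disallowed_special=() mirrors the patch: special tokens are counted as plain text
    return len(_encoder(model).encode(txt, disallowed_special=()))

get_token_num_gpt35 = lambda txt: token_count(txt, "gpt-3.5-turbo")
get_token_num_gpt4 = lambda txt: token_count(txt, "gpt-4")
```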