From 0cb7dd5280081dbc2bb3548c1d8542e967602d9b Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Fri, 8 Dec 2023 22:22:01 +0800
Subject: [PATCH] test vector store on docker

---
 Dockerfile                                    | 12 +++++----
 check_proxy.py                                |  8 ++++++
 crazy_functions/vector_fns/vector_database.py |  2 +-
 crazy_functions/知识库问答.py                   |  2 +-
 docs/GithubAction+NoLocal+Vectordb            | 26 +++++++++++++++++++
 5 files changed, 43 insertions(+), 7 deletions(-)
 create mode 100644 docs/GithubAction+NoLocal+Vectordb

diff --git a/Dockerfile b/Dockerfile
index fe9579b..f51befa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -23,11 +23,13 @@ RUN pip3 install -r requirements.txt
 
 # Load project files, install remaining dependencies (required)
-RUN pip3 install torch --index-url https://download.pytorch.org/whl/cpu
-RUN pip3 install langchain sentence-transformers unstructured[local-inference] faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk
-
-COPY .cache /root/.cache
 COPY . .
 RUN pip3 install -r requirements.txt
+
+
+# Optional step: warm up modules (can be removed)
+RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
+
+
 
 # Launch (required)
-CMD ["python3", "-u", "tests/test_vector_plugins.py"]
+CMD ["python3", "-u", "main.py"]

diff --git a/check_proxy.py b/check_proxy.py
index e2ba3f1..2df8185 100644
--- a/check_proxy.py
+++ b/check_proxy.py
@@ -159,7 +159,15 @@ def warm_up_modules():
     enc.encode("模块预热", disallowed_special=())
     enc = model_info["gpt-4"]['tokenizer']
     enc.encode("模块预热", disallowed_special=())
+
+def warm_up_vectordb():
+    print('正在执行一些模块的预热 ...')
+    from toolbox import ProxyNetworkActivate
+    with ProxyNetworkActivate("Warmup_Modules"):
+        import nltk
+        with ProxyNetworkActivate("Warmup_Modules"): nltk.download("punkt")
+
 
 if __name__ == '__main__':
     import os
     os.environ['no_proxy'] = '*'  # avoid unexpected contamination from proxy networks

diff --git a/crazy_functions/vector_fns/vector_database.py b/crazy_functions/vector_fns/vector_database.py
index def2ccc..cffa22c 100644
--- a/crazy_functions/vector_fns/vector_database.py
+++ b/crazy_functions/vector_fns/vector_database.py
@@ -242,7 +242,7 @@ class LocalDocQA:
 
 def construct_vector_store(vs_id, vs_path, files, sentence_size, history, one_conent, one_content_segmentation, text2vec):
     for file in files:
-        assert os.path.exists(file), "输入文件不存在"
+        assert os.path.exists(file), "输入文件不存在:" + file
     import nltk
     if NLTK_DATA_PATH not in nltk.data.path: nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
     local_doc_qa = LocalDocQA()

diff --git a/crazy_functions/知识库问答.py b/crazy_functions/知识库问答.py
index b6ddb65..e1cd00c 100644
--- a/crazy_functions/知识库问答.py
+++ b/crazy_functions/知识库问答.py
@@ -51,7 +51,7 @@ def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
     for sp in spl:
         _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}')
         file_manifest += file_manifest_tmp
-    
+
     if len(file_manifest) == 0:
         chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"])
         yield from update_ui(chatbot=chatbot, history=history) # refresh the UI

diff --git a/docs/GithubAction+NoLocal+Vectordb b/docs/GithubAction+NoLocal+Vectordb
new file mode 100644
index 0000000..98595e3
--- /dev/null
+++ b/docs/GithubAction+NoLocal+Vectordb
@@ -0,0 +1,26 @@
+# This Dockerfile builds a "no local model" environment; to use local models such as chatglm, see docs/Dockerfile+ChatGLM
+# How to build: first edit `config.py`, then: docker build -t gpt-academic-nolocal-vs -f docs/GithubAction+NoLocal+Vectordb .
+# How to run: docker run --rm -it --net=host gpt-academic-nolocal-vs
+FROM python:3.11
+
+# Set the working directory
+WORKDIR /gpt
+
+# Load project files
+COPY . .
+
+# Install dependencies
+RUN pip3 install -r requirements.txt
+
+# Install extra dependencies for the knowledge-base plugin
+RUN apt-get update && apt-get install libgl1 -y
+RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
+RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade
+RUN pip3 install unstructured[all-docs] --upgrade
+
+# Optional step: warm up modules
+RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
+RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()'
+
+# Launch
+CMD ["python3", "-u", "main.py"]
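
Note on the warm-up step: warm_up_vectordb() only pre-fetches NLTK's punkt
tokenizer data at image-build time, so the knowledge-base plugin does not
stall on a download at request time. A minimal standalone sketch of the same
warm-up, assuming only that nltk is installed (the ProxyNetworkActivate
context manager from toolbox is repo-specific and omitted here):

    # Pre-fetch the punkt tokenizer data, as warm_up_vectordb() does,
    # minus the repo's proxy wrapper.
    import nltk

    nltk.download("punkt")              # downloads to the default nltk_data path
    nltk.data.find("tokenizers/punkt")  # raises LookupError if the download failed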