From fa374bf1fc3357ff0ddaed19785290dc377e85a1 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 11 Dec 2023 22:50:19 +0800 Subject: [PATCH 1/4] try full dockerfile with vector store --- .../build-with-all-capacity-beta.yml | 44 +++++++++++++++ docs/GithubAction+AllCapacityBeta | 53 +++++++++++++++++++ docs/GithubAction+NoLocal+Vectordb | 2 +- 3 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build-with-all-capacity-beta.yml create mode 100644 docs/GithubAction+AllCapacityBeta diff --git a/.github/workflows/build-with-all-capacity-beta.yml b/.github/workflows/build-with-all-capacity-beta.yml new file mode 100644 index 0000000..1f02fed --- /dev/null +++ b/.github/workflows/build-with-all-capacity-beta.yml @@ -0,0 +1,44 @@ +# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages +name: build-with-all-capacity + +on: + push: + branches: + - 'master' + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}_with_all_capacity_beta + +jobs: + build-and-push-image: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: . + push: true + file: docs/GithubAction+AllCapacityBeta + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/docs/GithubAction+AllCapacityBeta b/docs/GithubAction+AllCapacityBeta new file mode 100644 index 0000000..d3a06ee --- /dev/null +++ b/docs/GithubAction+AllCapacityBeta @@ -0,0 +1,53 @@ +# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacity --network=host --build-arg http_proxy=http://localhost:10881 --build-arg https_proxy=http://localhost:10881 . +# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacityBeta --network=host . +# docker run -it --net=host gpt-academic-all-capacity bash + +# 从NVIDIA源,从而支持显卡(检查宿主的nvidia-smi中的cuda版本必须>=11.3) +FROM fuqingxu/11.3.1-runtime-ubuntu20.04-with-texlive:latest + +# use python3 as the system default python +WORKDIR /gpt +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8 + +# # 非必要步骤,更换pip源 (以下三行,可以删除) +# RUN echo '[global]' > /etc/pip.conf && \ +# echo 'index-url = https://mirrors.aliyun.com/pypi/simple/' >> /etc/pip.conf && \ +# echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf + +# 下载pytorch +RUN python3 -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113 +# 准备pip依赖 +RUN python3 -m pip install openai numpy arxiv rich +RUN python3 -m pip install colorama Markdown pygments pymupdf +RUN python3 -m pip install python-docx moviepy pdfminer +RUN python3 -m pip install zh_langchain==0.2.1 pypinyin +RUN python3 -m pip install rarfile py7zr +RUN python3 -m pip install aliyun-python-sdk-core==2.13.3 pyOpenSSL webrtcvad scipy git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git +# 下载分支 +WORKDIR /gpt +RUN git clone --depth=1 https://github.com/binary-husky/gpt_academic.git +WORKDIR /gpt/gpt_academic +RUN git clone --depth=1 https://github.com/OpenLMLab/MOSS.git request_llms/moss + +RUN python3 -m pip install -r requirements.txt +RUN python3 -m pip install -r request_llms/requirements_moss.txt +RUN python3 -m pip install -r request_llms/requirements_qwen.txt +RUN python3 -m pip install -r request_llms/requirements_chatglm.txt +RUN python3 -m pip install -r request_llms/requirements_newbing.txt +RUN python3 -m pip install nougat-ocr + +# 预热Tiktoken模块 +RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()' + +# 安装知识库插件的额外依赖 +RUN apt-get update && apt-get install libgl1 -y +RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade +RUN pip3 install unstructured[all-docs] --upgrade +RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()' +RUN rm -rf /usr/local/lib/python3.8/dist-packages/tests + + +# COPY .cache /root/.cache +# COPY config_private.py config_private.py +# 启动 +CMD ["python3", "-u", "main.py"] diff --git a/docs/GithubAction+NoLocal+Vectordb b/docs/GithubAction+NoLocal+Vectordb index 98595e3..45074d9 100644 --- a/docs/GithubAction+NoLocal+Vectordb +++ b/docs/GithubAction+NoLocal+Vectordb @@ -17,10 +17,10 @@ RUN apt-get update && apt-get install libgl1 -y RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade RUN pip3 install unstructured[all-docs] --upgrade +RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()' # 可选步骤,用于预热模块 RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()' -RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()' # 启动 CMD ["python3", "-u", "main.py"] From 8c7569b689c57a633880e29770f8c8a65dd777aa Mon Sep 17 00:00:00 2001 From: binary-husky Date: Thu, 14 Dec 2023 11:00:55 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dprotobuf=E7=89=88?= =?UTF-8?q?=E6=9C=AC=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- request_llms/requirements_chatglm_onnx.txt | 2 -- request_llms/requirements_moss.txt | 1 - requirements.txt | 1 + 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/request_llms/requirements_chatglm_onnx.txt b/request_llms/requirements_chatglm_onnx.txt index 5481147..2cd11f6 100644 --- a/request_llms/requirements_chatglm_onnx.txt +++ b/request_llms/requirements_chatglm_onnx.txt @@ -6,5 +6,3 @@ sentencepiece numpy onnxruntime sentencepiece -streamlit -streamlit-chat diff --git a/request_llms/requirements_moss.txt b/request_llms/requirements_moss.txt index c27907c..544b25f 100644 --- a/request_llms/requirements_moss.txt +++ b/request_llms/requirements_moss.txt @@ -5,5 +5,4 @@ accelerate matplotlib huggingface_hub triton -streamlit diff --git a/requirements.txt b/requirements.txt index a5782f7..e253415 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ pypdf2==2.12.1 tiktoken>=0.3.3 requests[socks] pydantic==1.10.11 +protobuf==3.18 transformers>=4.27.1 scipdf_parser>=0.52 python-markdown-math From c181ad38b454ca3705fafd0ddbab348410e3cd7b Mon Sep 17 00:00:00 2001 From: binary-husky <96192199+binary-husky@users.noreply.github.com> Date: Thu, 14 Dec 2023 12:23:49 +0800 Subject: [PATCH 3/4] Update build-with-all-capacity-beta.yml --- .github/workflows/build-with-all-capacity-beta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-with-all-capacity-beta.yml b/.github/workflows/build-with-all-capacity-beta.yml index 1f02fed..5a2a1a5 100644 --- a/.github/workflows/build-with-all-capacity-beta.yml +++ b/.github/workflows/build-with-all-capacity-beta.yml @@ -1,5 +1,5 @@ # https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages -name: build-with-all-capacity +name: build-with-all-capacity-beta on: push: From f4127a9c9c4a7610c5fa6aa9233a5554a46f11e5 Mon Sep 17 00:00:00 2001 From: qingxu fu <505030475@qq.com> Date: Fri, 15 Dec 2023 12:52:21 +0800 Subject: [PATCH 4/4] change clip history policy --- crazy_functions/latex_fns/latex_actions.py | 2 +- toolbox.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py index 113a278..b43d7d2 100644 --- a/crazy_functions/latex_fns/latex_actions.py +++ b/crazy_functions/latex_fns/latex_actions.py @@ -404,7 +404,7 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f result_pdf = pj(work_folder_modified, f'merge_diff.pdf') # get pdf path promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI if modified_pdf_success: - yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 即将退出 ...', chatbot, history) # 刷新Gradio前端界面 + yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 正在尝试生成对比PDF, 请稍后 ...', chatbot, history) # 刷新Gradio前端界面 result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') # get pdf path origin_pdf = pj(work_folder_original, f'{main_file_original}.pdf') # get pdf path if os.path.exists(pj(work_folder, '..', 'translation')): diff --git a/toolbox.py b/toolbox.py index 8d91035..bb4ec66 100644 --- a/toolbox.py +++ b/toolbox.py @@ -1007,14 +1007,19 @@ def clip_history(inputs, history, tokenizer, max_token_limit): def get_token_num(txt): return len(tokenizer.encode(txt, disallowed_special=())) input_token_num = get_token_num(inputs) + + if max_token_limit < 5000: output_token_expect = 256 # 4k & 2k models + elif max_token_limit < 9000: output_token_expect = 512 # 8k models + else: output_token_expect = 1024 # 16k & 32k models + if input_token_num < max_token_limit * 3 / 4: # 当输入部分的token占比小于限制的3/4时,裁剪时 # 1. 把input的余量留出来 max_token_limit = max_token_limit - input_token_num # 2. 把输出用的余量留出来 - max_token_limit = max_token_limit - 128 + max_token_limit = max_token_limit - output_token_expect # 3. 如果余量太小了,直接清除历史 - if max_token_limit < 128: + if max_token_limit < output_token_expect: history = [] return history else: