UTF-8: tolerate decoding errors when reading files (open with errors='replace')

2023-04-12 16:57:01 +08:00 · 2023-04-12 16:57:01 +08:00 · 40bd857c70
commit 40bd857c70
parent 88a86635c6
8 changed files with 55 additions and 8 deletions
--- a/47
+++ b/47
@ -0,0 +1,47 @@
 # Docker image for gpt-academic with a locally hosted ChatGLM model.
 # How to build: docker build -t gpt-academic --network=host -f Dockerfile+ChatGLM .
 # How to run (1) start directly: docker run --rm -it --net=host gpt-academic
 # How to run (2) open a shell in the container first to make adjustments: docker run --rm -it --net=host --gpus=all gpt-academic bash
 # Build from the NVIDIA CUDA base image so the container can use the GPU
 # (the CUDA version reported by nvidia-smi on the host must be >= 11.3).
 FROM nvidia/cuda:11.3.1-runtime-ubuntu20.04
 # Command prefix for network-bound build steps; empty means "connect directly".
 ARG useProxyNetwork=''
 # Refresh the package index and install in the same layer so a cached, stale
 # index is never reused by a later install.
 RUN apt-get update && apt-get install -y --fix-missing \
     curl proxychains git python python3 python-dev python3-dev
 # Configure the proxy network (used only while building the image).
 # # If you do not need a proxy, delete everything from this line down to the matching marker below.
 # Connectivity check without the proxy (useProxyNetwork is still empty here).
 RUN $useProxyNetwork curl cip.cc
 # Drop the last two lines of the stock proxychains.conf (presumably its default
 # proxy entry - verify against the installed file) and append our SOCKS5 proxy.
 RUN sed -i '$ d' /etc/proxychains.conf
 RUN sed -i '$ d' /etc/proxychains.conf
 RUN echo "socks5 127.0.0.1 10880" >> /etc/proxychains.conf
 # From here on, every `$useProxyNetwork <cmd>` step runs through proxychains.
 ARG useProxyNetwork=proxychains
 # # If you do not need a proxy, delete everything from this line up to the matching marker above.
 # Install pip for the Python 3.8 interpreter.
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8
 # Clone the release branch.
 WORKDIR /gpt
 RUN $useProxyNetwork git clone https://github.com/binary-husky/chatgpt_academic.git -b v3.0
 WORKDIR /gpt/chatgpt_academic
 RUN $useProxyNetwork python3 -m pip install -r requirements.txt
 RUN $useProxyNetwork python3 -m pip install -r request_llm/requirements_chatglm.txt
 RUN $useProxyNetwork python3 -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu113
 # Write proxy and API-KEY settings for chatgpt-academic (optional step).
 # Replace the placeholder key below with a real one before building.
 RUN echo ' \n\
 API_KEY = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \n\
 USE_PROXY = True \n\
 proxies = { "http": "socks5h://localhost:10880", "https": "socks5h://localhost:10880", } ' >> config_private.py
 # Warm up ChatGLM by loading the tokenizer and model once at build time (optional step).
 RUN echo ' \n\
 from transformers import AutoModel, AutoTokenizer \n\
 chatglm_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) \n\
 chatglm_model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).float() ' >> warm_up_chatglm.py
 RUN python3 -u warm_up_chatglm.py
 # Launch the application.
 CMD ["python3", "-u", "main.py"]
--- a/crazy_functions/Latex全文润色.py
+++ b/crazy_functions/Latex全文润色.py
@ -45,7 +45,7 @@ def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
    pfg = PaperFileGroup()
    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
            # 定义注释的正则表达式
            comment_pattern = r'%.*'
--- a/crazy_functions/Latex全文翻译.py
+++ b/crazy_functions/Latex全文翻译.py
@ -44,7 +44,7 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
    pfg = PaperFileGroup()
    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
            # 定义注释的正则表达式
            comment_pattern = r'%.*'
--- a/crazy_functions/代码重写为全英文_多线程.py
+++ b/crazy_functions/代码重写为全英文_多线程.py
@ -49,7 +49,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
    # 第4步：随便显示点什么防止卡顿的感觉
    for index, fp in enumerate(file_manifest):
        # if 'test_project' in fp: continue
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        i_say_show_user =f'[{index}/{len(file_manifest)}] 接下来请将以下代码中包含的所有中文转化为英文，只输出转化后的英文代码，请用代码块输出代码: {os.path.abspath(fp)}'
        i_say_show_user_buffer.append(i_say_show_user)
@ -72,7 +72,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
        if index > 10: 
            time.sleep(60)
            print('Openai 限制免费用户每分钟20次请求，降低请求频率中。')
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        i_say_template = lambda fp, file_content: f'接下来请将以下代码中包含的所有中文转化为英文，只输出代码，文件名是{fp}，文件代码是 ```{file_content}```'
        try:
--- a/crazy_functions/批量总结PDF文档pdfminer.py
+++ b/crazy_functions/批量总结PDF文档pdfminer.py
@ -68,7 +68,7 @@ def 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbo
    print('begin analysis on:', file_manifest)
    for index, fp in enumerate(file_manifest):
        if ".tex" in fp:
-            with open(fp, 'r', encoding='utf-8') as f:
+            with open(fp, 'r', encoding='utf-8', errors='replace') as f:
                file_content = f.read()
        if ".pdf" in fp.lower():
            file_content = readPdf(fp)
--- a/crazy_functions/生成函数注释.py
+++ b/crazy_functions/生成函数注释.py
@ -7,7 +7,7 @@ def 生成函数注释(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
    import time, os
    print('begin analysis on:', file_manifest)
    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        i_say = f'请对下面的程序文件做一个概述，并对文件中的所有函数生成注释，使用markdown表格输出结果，文件名是{os.path.relpath(fp, project_folder)}，文件内容是 ```{file_content}```'
--- a/crazy_functions/解析项目源代码.py
+++ b/crazy_functions/解析项目源代码.py
@ -14,7 +14,7 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
    ############################## <第一步，逐个文件分析，多线程> ##################################
    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        prefix = "接下来请你逐文件分析下面的工程" if index==0 else ""
        i_say = prefix + f'请对下面的程序文件做一个概述文件名是{os.path.relpath(fp, project_folder)}，文件代码是 ```{file_content}```'
--- a/crazy_functions/读文章写摘要.py
+++ b/crazy_functions/读文章写摘要.py
@ -8,7 +8,7 @@ def 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbo
    import time, glob, os
    print('begin analysis on:', file_manifest)
    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        prefix = "接下来请你逐文件分析下面的论文文件，概括其内容" if index==0 else ""