Merge pull request #1009 from ValeriaWong/master

feat(chatglm_int8_onnx): CPU-only inference needing at most 8 GB of RAM; inference speed not yet benchmarked; the token limit is small; streaming output is not supported yet #…

Commit 9bee676cd2
.gitignore (vendored, 1 change)
@@ -151,3 +151,4 @@ multi-language
 request_llm/moss
 media
 flagged
+request_llm/ChatGLM-6b-onnx-u8s8
config.py
@@ -71,7 +71,7 @@ MAX_RETRY = 2
 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 )
 LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓
 AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"]
-# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"]
+# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"]
 
 
 # ChatGLM(2) Finetune Model Path (如果使用ChatGLM2微调模型,需要把"chatglmft"加入AVAIL_LLM_MODELS中)
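For readers wiring this up, a minimal sketch of the opt-in step (the variable names come from the config.py hunk above; the surrounding list values are illustrative, not the full defaults):

# config.py (illustrative excerpt): "chatglm_onnx" only becomes selectable once it is listed here.
LLM_MODEL = "gpt-3.5-turbo"                             # default model; must itself appear in AVAIL_LLM_MODELS
AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "chatglm_onnx"]    # adding "chatglm_onnx" enables the ONNX int8 backend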
request_llm/bridge_all.py
@@ -19,9 +19,6 @@ from .bridge_chatgpt import predict as chatgpt_ui
 from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui
 from .bridge_chatglm import predict as chatglm_ui
 
-# from .bridge_tgui import predict_no_ui_long_connection as tgui_noui
-# from .bridge_tgui import predict as tgui_ui
-
 colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044']
 
 class LazyloadTiktoken(object):
@@ -322,6 +319,22 @@ if "internlm" in AVAIL_LLM_MODELS:
         })
     except:
         print(trimmed_format_exc())
+if "chatglm_onnx" in AVAIL_LLM_MODELS:
+    try:
+        from .bridge_chatglmonnx import predict_no_ui_long_connection as chatglm_onnx_noui
+        from .bridge_chatglmonnx import predict as chatglm_onnx_ui
+        model_info.update({
+            "chatglm_onnx": {
+                "fn_with_ui": chatglm_onnx_ui,
+                "fn_without_ui": chatglm_onnx_noui,
+                "endpoint": None,
+                "max_token": 4096,
+                "tokenizer": tokenizer_gpt35,
+                "token_cnt": get_token_num_gpt35,
+            }
+        })
+    except:
+        print(trimmed_format_exc())
 
 def LLM_CATCH_EXCEPTION(f):
     """
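For context, a hedged sketch of how a registered model_info entry is typically consumed by the dispatcher; call_model is a hypothetical helper for illustration, not a function in this commit:

# Hypothetical dispatch sketch: model_info maps a model name to its callables and token helpers.
def call_model(llm_model, inputs, llm_kwargs, history, sys_prompt, model_info):
    entry = model_info[llm_model]        # e.g. the "chatglm_onnx" entry registered above
    method = entry["fn_without_ui"]      # chatglm_onnx_noui (the no-UI, long-connection variant)
    return method(inputs, llm_kwargs, history=history, sys_prompt=sys_prompt)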
request_llm/bridge_chatglmonnx.py (new file, 73 lines)

model_name = "ChatGLM-ONNX"
cmd_to_install = "`pip install request_llm/requirements_chatglm_onnx.txt`"


from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf
from multiprocessing import Process, Pipe
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM

from .chatglmoonx import ChatGLMModel, chat_template


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Local Model
# ------------------------------------------------------------------------------------------------------------------------
@SingletonLocalLLM
class GetONNXGLMHandle(LocalLLMHandle):

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        import os, glob
        if not len(glob.glob("./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/*.bin")) >= 7: # 该模型有七个 bin 文件
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="K024/ChatGLM-6b-onnx-u8s8", local_dir="./request_llm/ChatGLM-6b-onnx-u8s8")
        def create_model():
            return ChatGLMModel(
                tokenizer_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model",
                onnx_model_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"
            )
        self._model = create_model()
        return self._model, None

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        def adaptor(kwargs):
            query = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            return query, max_length, top_p, temperature, history

        query, max_length, top_p, temperature, history = adaptor(kwargs)

        prompt = chat_template(history, query)
        for answer in self._model.generate_iterate(
            prompt,
            max_generated_tokens=max_length,
            top_k=1,
            top_p=top_p,
            temperature=temperature,
        ):
            yield answer

    def try_to_import_special_deps(self, **kwargs):
        # import something that will raise error if the user does not install requirement_*.txt
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        pass


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name)
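A minimal usage sketch of the two entry points this file exports, assuming the repository root is on sys.path and the model files can be downloaded on first use; the llm_kwargs keys follow what llm_stream_generator reads:

from request_llm.bridge_chatglmonnx import predict_no_ui_long_connection

llm_kwargs = {'max_length': 512, 'top_p': 0.7, 'temperature': 1.0}
reply = predict_no_ui_long_connection("你好", llm_kwargs, history=[], sys_prompt="You are a helpful assistant.")
print(reply)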
request_llm/bridge_internlm.py
@@ -1,23 +1,25 @@
+model_name = "InternLM"
+cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`"
+
 from transformers import AutoModel, AutoTokenizer
 import time
 import threading
 import importlib
-from toolbox import update_ui, get_conf, Singleton
+from toolbox import update_ui, get_conf
 from multiprocessing import Process, Pipe
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
 
-model_name = "InternLM"
-cmd_to_install = "`pip install ???`"
-load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"
-
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Local Model Utils
+# ------------------------------------------------------------------------------------------------------------------------
 def try_to_import_special_deps():
     import sentencepiece
 
-user_prompt = "<|User|>:{user}<eoh>\n"
-robot_prompt = "<|Bot|>:{robot}<eoa>\n"
-cur_query_prompt = "<|User|>:{user}<eoh>\n<|Bot|>:"
-
 
 def combine_history(prompt, hist):
+    user_prompt = "<|User|>:{user}<eoh>\n"
+    robot_prompt = "<|Bot|>:{robot}<eoa>\n"
+    cur_query_prompt = "<|User|>:{user}<eoh>\n<|Bot|>:"
     messages = hist
     total_prompt = ""
     for message in messages:
@@ -29,24 +31,22 @@ def combine_history(prompt, hist):
     total_prompt = total_prompt + cur_query_prompt.replace("{user}", prompt)
     return total_prompt
 
-@Singleton
-class GetInternlmHandle(Process):
-    def __init__(self):
-        # ⭐主进程执行
-        super().__init__(daemon=True)
-        self.parent, self.child = Pipe()
-        self._model = None
-        self._tokenizer = None
-        self.info = ""
-        self.success = True
-        self.check_dependency()
-        self.start()
-        self.threadLock = threading.Lock()
-
-    def ready(self):
-        # ⭐主进程执行
-        return self._model is not None
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Local Model
+# ------------------------------------------------------------------------------------------------------------------------
+@SingletonLocalLLM
+class GetInternlmHandle(LocalLLMHandle):
+
+    def load_model_info(self):
+        # 🏃♂️🏃♂️🏃♂️ 子进程执行
+        self.model_name = model_name
+        self.cmd_to_install = cmd_to_install
+
+    def try_to_import_special_deps(self, **kwargs):
+        """
+        import something that will raise error if the user does not install requirement_*.txt
+        """
+        import sentencepiece
 
     def load_model_and_tokenizer(self):
         # 🏃♂️🏃♂️🏃♂️ 子进程执行
@@ -196,117 +196,7 @@ class GetInternlmHandle(Process):
         return
 
 
-    def check_dependency(self):
-        # 🏃♂️🏃♂️🏃♂️ 子进程执行
-        try:
-            try_to_import_special_deps()
-            self.info = "依赖检测通过"
-            self.success = True
-        except:
-            self.info = f"缺少{model_name}的依赖,如果要使用{model_name},除了基础的pip依赖以外,您还需要运行{cmd_to_install}安装{model_name}的依赖。"
-            self.success = False
-
-    def run(self):
-        # 🏃♂️🏃♂️🏃♂️ 子进程执行
-        # 第一次运行,加载参数
-        try:
-            self._model, self._tokenizer = self.load_model_and_tokenizer()
-        except:
-            from toolbox import trimmed_format_exc
-            self.child.send(f'[Local Message] 不能正常加载{model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
-            raise RuntimeError(f"不能正常加载{model_name}的参数!")
-
-        while True:
-            # 进入任务等待状态
-            kwargs = self.child.recv()
-            # 收到消息,开始请求
-            try:
-                for response_full in self.llm_stream_generator(**kwargs):
-                    self.child.send(response_full)
-            except:
-                from toolbox import trimmed_format_exc
-                self.child.send(f'[Local Message] 调用{model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
-            # 请求处理结束,开始下一个循环
-            self.child.send('[Finish]')
-
-    def stream_chat(self, **kwargs):
-        # ⭐主进程执行
-        self.threadLock.acquire()
-        self.parent.send(kwargs)
-        while True:
-            res = self.parent.recv()
-            if res != '[Finish]':
-                yield res
-            else:
-                break
-        self.threadLock.release()
-
 # ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 GPT-Academic
+# 🔌💻 GPT-Academic Interface
 # ------------------------------------------------------------------------------------------------------------------------
-def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
-    """
-        ⭐多线程方法
-        函数的说明请见 request_llm/bridge_all.py
-    """
-    _llm_handle = GetInternlmHandle()
-    if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info
-    if not _llm_handle.success:
-        error = _llm_handle.info
-        _llm_handle = None
-        raise RuntimeError(error)
-
-    # chatglm 没有 sys_prompt 接口,因此把prompt加入 history
-    history_feedin = []
-    history_feedin.append(["What can I do?", sys_prompt])
-    for i in range(len(history)//2):
-        history_feedin.append([history[2*i], history[2*i+1]] )
-
-    watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
-    response = ""
-    for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
-        if len(observe_window) >= 1: observe_window[0] = response
-        if len(observe_window) >= 2:
-            if (time.time()-observe_window[1]) > watch_dog_patience:
-                raise RuntimeError("程序终止。")
-    return response
-
-
-
-def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
-    """
-        ⭐单线程方法
-        函数的说明请见 request_llm/bridge_all.py
-    """
-    chatbot.append((inputs, ""))
-
-    _llm_handle = GetInternlmHandle()
-    chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info)
-    yield from update_ui(chatbot=chatbot, history=[])
-    if not _llm_handle.success:
-        _llm_handle = None
-        return
-
-    if additional_fn is not None:
-        from core_functional import handle_core_functionality
-        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
-
-    # 处理历史信息
-    history_feedin = []
-    history_feedin.append(["What can I do?", system_prompt] )
-    for i in range(len(history)//2):
-        history_feedin.append([history[2*i], history[2*i+1]] )
-
-    # 开始接收chatglm的回复
-    response = f"[Local Message]: 等待{model_name}响应中 ..."
-    for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
-        chatbot[-1] = (inputs, response)
-        yield from update_ui(chatbot=chatbot, history=history)
-
-    # 总结输出
-    if response == f"[Local Message]: 等待{model_name}响应中 ...":
-        response = f"[Local Message]: {model_name}响应异常 ..."
-    history.extend([inputs, response])
-    yield from update_ui(chatbot=chatbot, history=history)
+predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetInternlmHandle, model_name)
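To make the InternLM prompt format concrete, a small worked example of what combine_history assembles, assuming the (unchanged) loop body renders each prior (user, bot) pair with the user_prompt and robot_prompt templates shown above; the history strings are illustrative:

hist = [["What can I do?", "I can answer questions."]]      # one prior (user, bot) exchange
prompt = combine_history("Explain ONNX in one sentence.", hist)
# Expected shape of the result:
#   <|User|>:What can I do?<eoh>
#   <|Bot|>:I can answer questions.<eoa>
#   <|User|>:Explain ONNX in one sentence.<eoh>
#   <|Bot|>: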
request_llm/chatglmoonx.py (new file, 229 lines)

# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py
# ------------------------------------------------------------------------------------------------------------------------
import re
import numpy as np
# import torch
from onnxruntime import InferenceSession, SessionOptions


# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU,
# although they are documented as supported on CUDA.
providers = ["CPUExecutionProvider"]

# if torch.cuda.is_available():
#     providers = ["CUDAExecutionProvider"] + providers


# Default paths
tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model"
onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"


# input & output names
past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]]
present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]]
output_names = ["logits"] + present_names


# default kv_cache for first inference
default_past_key_values = {
    k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names
}


def chat_template(history: list[tuple[str, str]], current: str):
    prompt = ""
    chat_round = 0
    for question, answer in history:
        prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n"
        chat_round += 1
    prompt += f"[Round {chat_round}]\n问:{current}\n答:"
    return prompt


def process_response(response: str):
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    punkts = [
        [",", ","],
        ["!", "!"],
        [":", ":"],
        [";", ";"],
        ["\?", "?"],
    ]
    for item in punkts:
        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
    return response


class ChatGLMModel():

    def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None:
        self.tokenizer = ChatGLMTokenizer(tokenizer_path)
        options = SessionOptions()
        options.enable_profiling = profile
        self.session = InferenceSession(onnx_model_path, options, providers=providers)
        self.eop_token_id = self.tokenizer["<eop>"]

    def prepare_input(self, prompt: str):
        input_ids, prefix_mask = self.tokenizer.encode(prompt)

        input_ids = np.array([input_ids], dtype=np.longlong)
        prefix_mask = np.array([prefix_mask], dtype=np.longlong)

        return input_ids, prefix_mask, default_past_key_values

    def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1):
        # softmax with temperature
        exp_logits = np.exp(logits / temperature)
        probs = exp_logits / np.sum(exp_logits)

        # top k
        top_k_idx = np.argsort(-probs)[:top_k]
        top_k_probs = probs[top_k_idx]

        # top p
        cumsum_probs = np.cumsum(top_k_probs)
        top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0
        top_k_probs = top_k_probs / np.sum(top_k_probs)

        # sample
        next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs)
        return next_token[0].item()

    def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1):
        input_ids, prefix_mask, past_key_values = self.prepare_input(prompt)
        output_tokens = []

        while True:
            inputs = {
                "input_ids": input_ids,
                "prefix_mask": prefix_mask,
                "use_past": np.array(len(output_tokens) > 0),
            }
            inputs.update(past_key_values)

            logits, *past_key_values = self.session.run(output_names, inputs)
            past_key_values = { k: v for k, v in zip(past_names, past_key_values) }

            next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature)

            output_tokens += [next_token]

            if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens:
                break

            input_ids = np.array([[next_token]], dtype=np.longlong)
            prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1)

            yield process_response(self.tokenizer.decode(output_tokens))

        return process_response(self.tokenizer.decode(output_tokens))


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
# ------------------------------------------------------------------------------------------------------------------------

import re
from sentencepiece import SentencePieceProcessor


def replace_spaces_with_blank(match: re.Match[str]):
    return f"<|blank_{len(match.group())}|>"


def replace_blank_with_spaces(match: re.Match[str]):
    return " " * int(match.group(1))


class ChatGLMTokenizer:
    def __init__(self, vocab_file):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))

    def __len__(self):
        return len(self.text_tokenizer)

    def __getitem__(self, key: str):
        return self.text_tokenizer[key]

    def preprocess(self, text: str, linebreak=True, whitespaces=True):
        if linebreak:
            text = text.replace("\n", "<n>")
        if whitespaces:
            text = text.replace("\t", "<|tab|>")
            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
        return text

    def encode(
        self, text: str, text_pair: str = None,
        linebreak=True, whitespaces=True,
        add_dummy_prefix=True, special_tokens=True,
    ) -> tuple[list[int], list[int]]:
        """
        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
        text_pair: causal LM part.
        linebreak: Whether to encode newline (\n) in text.
        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self.preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text

        tokens = self.text_tokenizer.encode(text)
        prefix_mask = [1] * len(tokens)
        if special_tokens:
            tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer["<sop>"]]
            prefix_mask += [1, 0]

        if text_pair is not None:
            text_pair = self.preprocess(text_pair, linebreak, whitespaces)
            pair_tokens = self.text_tokenizer.encode(text_pair)
            tokens += pair_tokens
            prefix_mask += [0] * len(pair_tokens)
            if special_tokens:
                tokens += [self.text_tokenizer["<eop>"]]
                prefix_mask += [0]

        return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask

    def decode(self, text_ids: list[int]) -> str:
        text = self.text_tokenizer.decode(text_ids)
        text = text.replace("<n>", "\n")
        text = text.replace("<|tab|>", "\t")
        text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text)
        return text
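A standalone usage sketch of the classes above; the paths assume the K024/ChatGLM-6b-onnx-u8s8 snapshot has already been downloaded to the directory used by bridge_chatglmonnx.py:

from request_llm.chatglmoonx import ChatGLMModel, chat_template

model = ChatGLMModel(
    tokenizer_path="./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model",
    onnx_model_path="./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx",
)
prompt = chat_template(history=[], current="你好")
answer = ""
for answer in model.generate_iterate(prompt, max_generated_tokens=64):
    pass          # each yield is the full decoded answer so far
print(answer)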
request_llm/local_llm_class.py (new file, 178 lines)

from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf, Singleton
from multiprocessing import Process, Pipe

def SingletonLocalLLM(cls):
    """
    一个单实例装饰器
    """
    _instance = {}
    def _singleton(*args, **kargs):
        if cls not in _instance:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        elif _instance[cls].corrupted:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        else:
            return _instance[cls]
    return _singleton

class LocalLLMHandle(Process):
    def __init__(self):
        # ⭐主进程执行
        super().__init__(daemon=True)
        self.corrupted = False
        self.load_model_info()
        self.parent, self.child = Pipe()
        self.running = True
        self._model = None
        self._tokenizer = None
        self.info = ""
        self.check_dependency()
        self.start()
        self.threadLock = threading.Lock()

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")
        self.model_name = ""
        self.cmd_to_install = ""

    def load_model_and_tokenizer(self):
        """
        This function should return the model and the tokenizer
        """
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def try_to_import_special_deps(self, **kwargs):
        """
        import something that will raise error if the user does not install requirement_*.txt
        """
        # ⭐主进程执行
        raise NotImplementedError("Method not implemented yet")

    def check_dependency(self):
        # ⭐主进程执行
        try:
            self.try_to_import_special_deps()
            self.info = "依赖检测通过"
            self.running = True
        except:
            self.info = f"缺少{self.model_name}的依赖,如果要使用{self.model_name},除了基础的pip依赖以外,您还需要运行{self.cmd_to_install}安装{self.model_name}的依赖。"
            self.running = False

    def run(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        # 第一次运行,加载参数
        try:
            self._model, self._tokenizer = self.load_model_and_tokenizer()
        except:
            self.running = False
            from toolbox import trimmed_format_exc
            self.child.send(f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
            self.child.send('[FinishBad]')
            raise RuntimeError(f"不能正常加载{self.model_name}的参数!")

        while True:
            # 进入任务等待状态
            kwargs = self.child.recv()
            # 收到消息,开始请求
            try:
                for response_full in self.llm_stream_generator(**kwargs):
                    self.child.send(response_full)
                self.child.send('[Finish]')
                # 请求处理结束,开始下一个循环
            except:
                from toolbox import trimmed_format_exc
                self.child.send(f'[Local Message] 调用{self.model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
                self.child.send('[Finish]')

    def stream_chat(self, **kwargs):
        # ⭐主进程执行
        self.threadLock.acquire()
        self.parent.send(kwargs)
        while True:
            res = self.parent.recv()
            if res == '[Finish]':
                break
            if res == '[FinishBad]':
                self.running = False
                self.corrupted = True
                break
            else:
                yield res
        self.threadLock.release()



def get_local_llm_predict_fns(LLMSingletonClass, model_name):
    load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"

    def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
        """
        ⭐多线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        _llm_handle = LLMSingletonClass()
        if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info

        # chatglm 没有 sys_prompt 接口,因此把prompt加入 history
        history_feedin = []
        history_feedin.append(["What can I do?", sys_prompt])
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
        response = ""
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            if len(observe_window) >= 1:
                observe_window[0] = response
            if len(observe_window) >= 2:
                if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
        return response

    def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
        """
        ⭐单线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        chatbot.append((inputs, ""))

        _llm_handle = LLMSingletonClass()
        chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info)
        yield from update_ui(chatbot=chatbot, history=[])

        if additional_fn is not None:
            from core_functional import handle_core_functionality
            inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

        # 处理历史信息
        history_feedin = []
        history_feedin.append(["What can I do?", system_prompt] )
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        # 开始接收回复
        response = f"[Local Message]: 等待{model_name}响应中 ..."
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            chatbot[-1] = (inputs, response)
            yield from update_ui(chatbot=chatbot, history=history)

        # 总结输出
        if response == f"[Local Message]: 等待{model_name}响应中 ...":
            response = f"[Local Message]: {model_name}响应异常 ..."
        history.extend([inputs, response])
        yield from update_ui(chatbot=chatbot, history=history)

    return predict_no_ui_long_connection, predict
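For illustration, a hypothetical new backend built on this base class, following the same pattern bridge_chatglmonnx.py uses above; MyLocalModel and its loading details are placeholders, not part of this commit:

from request_llm.local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM

model_name = "MyLocalModel"                                                 # placeholder name
cmd_to_install = "`pip install -r request_llm/requirements_mymodel.txt`"   # placeholder hint

@SingletonLocalLLM
class GetMyModelHandle(LocalLLMHandle):
    def load_model_info(self):
        # runs in the subprocess
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # runs in the subprocess; must return (model, tokenizer)
        return None, None           # replace with real model/tokenizer loading

    def llm_stream_generator(self, **kwargs):
        # runs in the subprocess; yield progressively longer answers
        yield "partial answer"
        yield "partial answer, completed"

    def try_to_import_special_deps(self, **kwargs):
        # runs in the main process; raise here if extra dependencies are missing
        pass

# One call wires the class into the two entry points that bridge_all.py expects.
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetMyModelHandle, model_name)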
request_llm/requirements_chatglm.txt
@@ -1,5 +1,5 @@
 protobuf
-transformers==4.27.1
+transformers>=4.27.1
 cpm_kernels
 torch>=1.10
 mdtex2html
request_llm/requirements_chatglm_onnx.txt (new file, 11 lines)

protobuf
transformers>=4.27.1
cpm_kernels
torch>=1.10
mdtex2html
sentencepiece
numpy
onnxruntime
sentencepiece
streamlit
streamlit-chat