From c0c337988fa5c154676cdcd52a9f8b6780ce0f58 Mon Sep 17 00:00:00 2001 From: ValeriaWong <763717425@qq.com> Date: Tue, 1 Aug 2023 00:48:57 +0800 Subject: [PATCH 1/4] =?UTF-8?q?feat(chatglm=5Fint8=5Fonnx):=E7=BA=AFCPU?= =?UTF-8?q?=E6=8E=A8=E7=90=86=EF=BC=8C=E6=9C=80=E5=A4=9A=E4=BB=85=E9=9C=80?= =?UTF-8?q?8GB=E5=86=85=E5=AD=98=EF=BC=8C=E6=8E=A8=E7=90=86=E9=80=9F?= =?UTF-8?q?=E5=BA=A6=E6=9C=AA=E6=B5=8B=E8=AF=84=EF=BC=8Ctoken=E6=95=B0?= =?UTF-8?q?=E6=9C=89=E9=99=90=EF=BC=8C=E6=9A=82=E6=97=B6=E8=BF=98=E4=B8=8D?= =?UTF-8?q?=E8=83=BD=E6=B5=81=E5=BC=8F=E8=BE=93=E5=87=BA=20#1008?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 2 +- request_llm/bridge_all.py | 11 +- request_llm/bridge_chatglm_onnx.py | 354 ++++++++++++++++++++++ request_llm/requirements_chatglm_onnx.txt | 11 + 4 files changed, 376 insertions(+), 2 deletions(-) create mode 100644 request_llm/bridge_chatglm_onnx.py create mode 100644 request_llm/requirements_chatglm_onnx.txt diff --git a/config.py b/config.py index 0234bb4..659589f 100644 --- a/config.py +++ b/config.py @@ -70,7 +70,7 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"] +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm","chatglm_onnx","moss", "newbing", "stack-claude"] # P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] diff --git a/request_llm/bridge_all.py b/request_llm/bridge_all.py index c40c2e7..75e448e 100644 --- a/request_llm/bridge_all.py +++ b/request_llm/bridge_all.py @@ -19,6 +19,8 @@ from .bridge_chatgpt import predict as chatgpt_ui from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui from .bridge_chatglm import predict as chatglm_ui +from .bridge_chatglm_onnx import predict_no_ui_long_connection as chatglm_onnx_noui +from .bridge_chatglm_onnx import predict as chatglm_onnx_ui # from .bridge_tgui import predict_no_ui_long_connection as tgui_noui # from .bridge_tgui import predict as tgui_ui @@ -164,7 +166,14 @@ model_info = { "tokenizer": tokenizer_gpt35, "token_cnt": get_token_num_gpt35, }, - + "chatglm_onnx": { + "fn_with_ui": chatglm_onnx_ui, + "fn_without_ui": chatglm_onnx_noui, + "endpoint": None, + "max_token": 1024, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + }, } diff --git a/request_llm/bridge_chatglm_onnx.py b/request_llm/bridge_chatglm_onnx.py new file mode 100644 index 0000000..636b38d --- /dev/null +++ b/request_llm/bridge_chatglm_onnx.py @@ -0,0 +1,354 @@ +import re +import threading +from toolbox import update_ui, get_conf +from multiprocessing import Process, Pipe +import numpy as np +from onnxruntime import InferenceSession, SessionOptions +from sentencepiece import SentencePieceProcessor + + +# 模型来源 K024/ChatGLM-6b-onnx-u8s8 + +global glm_onnx_handle + + +glm_onnx_handle = None +load_message = "ChatGLM_onnx尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,ChatGLM_onnx消耗大量的内存(CPU)或显存(GPU),也许会导致低配(内存<8GB)计算机卡死 ……" + +# Default paths +tokenizer_path = "YOUR/TOKENIZER_PATH/sentencepiece.model" +onnx_model_path = "YOUR/TOKENIZER_PATH/chatglm-6b-int8.onnx" + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only 
supported on CPU, +# although they are documented as supported on CUDA. +providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +################################################################################# +class GetGLMHandle(Process): + + def __init__(self): + super().__init__(daemon=True) + self.parent, self.child = Pipe() + self.ChatGLM_onnx_model = None # tokenizer_path + self.ChatGLM_onnx_tokenizer = None # onnx_model_path + self.info = "" + self.success = True + self.check_dependency() + self.start() + self.threadLock = threading.Lock() + + def check_dependency(self): + try: + import sentencepiece + self.info = "依赖检测通过" + self.success = True + except: + self.info = "缺少ChatGLM_onnx的依赖,如果要使用ChatGLM_onnx,除了基础的pip依赖以外,您还需要运行`pip install -r request_llm/requirements_ChatGLM_onnx.txt`安装ChatGLM_onnx的依赖。" + self.success = False + + def ready(self): + return self.ChatGLM_onnx_model is not None + + + def run(self): + # 子进程执行 + # 第一次运行,加载参数 + retry = 0 + while True: + try: + if self.ChatGLM_onnx_model is None: + # Initialize the ChatGLMModel and ChatGLMTokenizer + self.ChatGLM_onnx_model = ChatGLMModel() + self.ChatGLM_onnx_tokenizer = ChatGLMTokenizer() + break + else: + break + except: + retry += 1 + if retry > 3: + self.child.send('[Local Message] Call ChatGLM_onnx fail 不能正常加载ChatGLM_onnx的参数。') + raise RuntimeError("不能正常加载ChatGLM_onnx的参数!") + + while True: + # 进入任务等待状态 + kwargs = self.child.recv() + # 收到消息,开始请求 + try: + # Use the ChatGLMModel and ChatGLMTokenizer to generate a response + response = tuple(self.ChatGLM_onnx_model.generate_iterate(kwargs['query'])) + + # Send the output data + self.child.send(response[-1]) + except: + from toolbox import trimmed_format_exc + self.child.send('[Local Message] Call ChatGLM_onnx fail.' 
+ '\n```\n' + trimmed_format_exc() + '\n```\n') + # 请求处理结束,开始下一个循环 + self.child.send('[Finish]') + + + + def stream_chat(self, **kwargs): + # 主进程执行 + self.threadLock.acquire() + self.parent.send(kwargs) + while True: + res = self.parent.recv() + if res != '[Finish]': + yield res + else: + break + self.threadLock.release() + + +################################################################################# +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + # input & output names + self.past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] + self.present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] + self.output_names = ["logits"] + self.present_names + + # default kv_cache for first inference + self.default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in self.past_names + } + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, self.default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(self.output_names, inputs) + past_key_values = { k: v for k, v in zip(self.past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) + + return process_response(self.tokenizer.decode(output_tokens)) + +class ChatGLMTokenizer: + def __init__(self, vocab_file): + assert vocab_file is not None + self.vocab_file = vocab_file + self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""] + self.text_tokenizer = SentencePieceProcessor(str(vocab_file)) + + def __len__(self): + return len(self.text_tokenizer) + + def __getitem__(self, key: str): + return self.text_tokenizer[key] + + + def preprocess(self, text: str, linebreak=True, whitespaces=True): + if 
linebreak: + text = text.replace("\\n", "") + if whitespaces: + text = text.replace("\\t", "<|tab|>") + text = re.sub(r" {2,80}", self.replace_spaces_with_blank, text) + return text + + + def encode( + self, text: str, text_pair: str = None, + linebreak=True, whitespaces=True, + add_dummy_prefix=True, special_tokens=True, + ) -> tuple[list[int], list[int]]: + """ + text: Text to encode. Bidirectional part with a [gMASK] and an for causal LM. + text_pair: causal LM part. + linebreak: Whether to encode newline (\n) in text. + whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. + special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. + add_dummy_prefix: Whether to add dummy blank space in the beginning. + """ + text = self.preprocess(text, linebreak, whitespaces) + if not add_dummy_prefix: + text = "" + text + + tokens = self.text_tokenizer.encode(text) + prefix_mask = [1] * len(tokens) + if special_tokens: + tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] + prefix_mask += [1, 0] + + if text_pair is not None: + text_pair = self.preprocess(text_pair, linebreak, whitespaces) + pair_tokens = self.text_tokenizer.encode(text_pair) + tokens += pair_tokens + prefix_mask += [0] * len(pair_tokens) + if special_tokens: + tokens += [self.text_tokenizer[""]] + prefix_mask += [0] + + return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask + + + def decode(self, text_ids: list[int]) -> str: + text = self.text_tokenizer.decode(text_ids) + text = text.replace("", "\n") + text = text.replace("<|tab|>", "\t") + text = re.sub(r"<\|blank_(\d\d?)\|>", self.replace_blank_with_spaces, text) + return text + def replace_spaces_with_blank(match: re.Match[str]): + return f"<|blank_{len(match.group())}|>" + + def replace_blank_with_spaces(match: re.Match[str]): + return " " * int(match.group(1)) + +################################################################################# + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + +################################################################################# + + +def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): + """ + 多线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + if glm_onnx_handle is None: + glm_onnx_handle = GetGLMHandle() + if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + glm_onnx_handle.info + if not glm_onnx_handle.success: + error = glm_onnx_handle.info + glm_onnx_handle = None + raise RuntimeError(error) + + # ChatGLM_onnx doesn't have a sys_prompt interface, so add the prompt to history + history_feedin = [] + history_feedin.append(["What can I do?", sys_prompt]) + for i in range(len(history) // 2): + history_feedin.append([history[2 * i], history[2 * i + 1]]) + + watch_dog_patience = 5 # Watchdog patience, set to 5 
seconds + response = "" + for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): + print(response) + if len(observe_window) >= 1: + observe_window[0] = response + if len(observe_window) >= 2: + if (time.time() - observe_window[1]) > watch_dog_patience: + raise RuntimeError("程序终止。") + return response + +def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream=True, additional_fn=None): + """ + 单线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + chatbot.append((inputs, "")) + + global glm_onnx_handle + if glm_onnx_handle is None: + glm_onnx_handle = GetGLMHandle() + chatbot[-1] = (inputs, load_message + "\n\n" + glm_onnx_handle.info) + yield from update_ui(chatbot=chatbot, history=[]) + if not glm_onnx_handle.success: + glm_onnx_handle = None + return + + if additional_fn is not None: + import core_functional + importlib.reload(core_functional) # Hot-reload prompt + core_functional = core_functional.get_core_functions() + if "PreProcess" in core_functional[additional_fn]: + inputs = core_functional[additional_fn]["PreProcess"](inputs) + inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"] + + history_feedin = [] + history_feedin.append(["What can I do?", system_prompt]) + for i in range(len(history) // 2): + history_feedin.append([history[2 * i], history[2 * i + 1]]) + + response = "[Local Message]: 等待ChatGLM_onnx响应中 ..." + for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): + chatbot[-1] = (inputs, response) + yield from update_ui(chatbot=chatbot, history=history) + + if response == "[Local Message]: 等待ChatGLM_onnx响应中 ...": + response = "[Local Message]: ChatGLM_onnx响应异常 ..." + history.extend([inputs, response]) + yield from update_ui(chatbot=chatbot, history=history) + + + + diff --git a/request_llm/requirements_chatglm_onnx.txt b/request_llm/requirements_chatglm_onnx.txt new file mode 100644 index 0000000..de072bd --- /dev/null +++ b/request_llm/requirements_chatglm_onnx.txt @@ -0,0 +1,11 @@ +protobuf +transformers==4.27.1 +cpm_kernels +torch>=1.10 +mdtex2html +sentencepiece +numpy +onnxruntime +sentencepiece +streamlit +streamlit-chat From d7dd586f09f9a0bb013612e0eac4107c2ca3d2b1 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 7 Aug 2023 00:57:52 +0800 Subject: [PATCH 2/4] introduce unified base class for local llm models --- .gitignore | 1 + config.py | 4 +- request_llm/bridge_all.py | 30 +- request_llm/bridge_chatglm_onnx.py | 354 ---------------------- request_llm/bridge_chatglmonnx.py | 308 +++++++++++++++++++ request_llm/bridge_internlm.py | 164 ++-------- request_llm/local_llm_class.py | 178 +++++++++++ request_llm/requirements_chatglm.txt | 2 +- request_llm/requirements_chatglm_onnx.txt | 2 +- 9 files changed, 535 insertions(+), 508 deletions(-) delete mode 100644 request_llm/bridge_chatglm_onnx.py create mode 100644 request_llm/bridge_chatglmonnx.py create mode 100644 request_llm/local_llm_class.py diff --git a/.gitignore b/.gitignore index 55c4db1..c4df287 100644 --- a/.gitignore +++ b/.gitignore @@ -151,3 +151,4 @@ multi-language request_llm/moss media flagged +request_llm/ChatGLM-6b-onnx-u8s8 diff --git a/config.py b/config.py index 659589f..c12d718 100644 --- a/config.py +++ b/config.py @@ -70,8 +70,8 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", 
"api2d-gpt-4", "chatglm","chatglm_onnx","moss", "newbing", "stack-claude"] -# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "internlm", "moss", "newbing", "stack-claude"] +# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] # ChatGLM(2) Finetune Model Path (如果使用ChatGLM2微调模型,需要把"chatglmft"加入AVAIL_LLM_MODELS中) diff --git a/request_llm/bridge_all.py b/request_llm/bridge_all.py index 75e448e..f38711d 100644 --- a/request_llm/bridge_all.py +++ b/request_llm/bridge_all.py @@ -19,11 +19,6 @@ from .bridge_chatgpt import predict as chatgpt_ui from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui from .bridge_chatglm import predict as chatglm_ui -from .bridge_chatglm_onnx import predict_no_ui_long_connection as chatglm_onnx_noui -from .bridge_chatglm_onnx import predict as chatglm_onnx_ui -# from .bridge_tgui import predict_no_ui_long_connection as tgui_noui -# from .bridge_tgui import predict as tgui_ui - colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044'] class LazyloadTiktoken(object): @@ -166,14 +161,7 @@ model_info = { "tokenizer": tokenizer_gpt35, "token_cnt": get_token_num_gpt35, }, - "chatglm_onnx": { - "fn_with_ui": chatglm_onnx_ui, - "fn_without_ui": chatglm_onnx_noui, - "endpoint": None, - "max_token": 1024, - "tokenizer": tokenizer_gpt35, - "token_cnt": get_token_num_gpt35, - }, + } @@ -331,6 +319,22 @@ if "internlm" in AVAIL_LLM_MODELS: }) except: print(trimmed_format_exc()) +if "chatglm_onnx" in AVAIL_LLM_MODELS: + try: + from .bridge_chatglmonnx import predict_no_ui_long_connection as chatglm_onnx_noui + from .bridge_chatglmonnx import predict as chatglm_onnx_ui + model_info.update({ + "chatglm_onnx": { + "fn_with_ui": chatglm_onnx_ui, + "fn_without_ui": chatglm_onnx_noui, + "endpoint": None, + "max_token": 4096, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + } + }) + except: + print(trimmed_format_exc()) def LLM_CATCH_EXCEPTION(f): """ diff --git a/request_llm/bridge_chatglm_onnx.py b/request_llm/bridge_chatglm_onnx.py deleted file mode 100644 index 636b38d..0000000 --- a/request_llm/bridge_chatglm_onnx.py +++ /dev/null @@ -1,354 +0,0 @@ -import re -import threading -from toolbox import update_ui, get_conf -from multiprocessing import Process, Pipe -import numpy as np -from onnxruntime import InferenceSession, SessionOptions -from sentencepiece import SentencePieceProcessor - - -# 模型来源 K024/ChatGLM-6b-onnx-u8s8 - -global glm_onnx_handle - - -glm_onnx_handle = None -load_message = "ChatGLM_onnx尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,ChatGLM_onnx消耗大量的内存(CPU)或显存(GPU),也许会导致低配(内存<8GB)计算机卡死 ……" - -# Default paths -tokenizer_path = "YOUR/TOKENIZER_PATH/sentencepiece.model" -onnx_model_path = "YOUR/TOKENIZER_PATH/chatglm-6b-int8.onnx" - -# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, -# although they are documented as supported on CUDA. 
-providers = ["CPUExecutionProvider"] - -# if torch.cuda.is_available(): -# providers = ["CUDAExecutionProvider"] + providers - - -################################################################################# -class GetGLMHandle(Process): - - def __init__(self): - super().__init__(daemon=True) - self.parent, self.child = Pipe() - self.ChatGLM_onnx_model = None # tokenizer_path - self.ChatGLM_onnx_tokenizer = None # onnx_model_path - self.info = "" - self.success = True - self.check_dependency() - self.start() - self.threadLock = threading.Lock() - - def check_dependency(self): - try: - import sentencepiece - self.info = "依赖检测通过" - self.success = True - except: - self.info = "缺少ChatGLM_onnx的依赖,如果要使用ChatGLM_onnx,除了基础的pip依赖以外,您还需要运行`pip install -r request_llm/requirements_ChatGLM_onnx.txt`安装ChatGLM_onnx的依赖。" - self.success = False - - def ready(self): - return self.ChatGLM_onnx_model is not None - - - def run(self): - # 子进程执行 - # 第一次运行,加载参数 - retry = 0 - while True: - try: - if self.ChatGLM_onnx_model is None: - # Initialize the ChatGLMModel and ChatGLMTokenizer - self.ChatGLM_onnx_model = ChatGLMModel() - self.ChatGLM_onnx_tokenizer = ChatGLMTokenizer() - break - else: - break - except: - retry += 1 - if retry > 3: - self.child.send('[Local Message] Call ChatGLM_onnx fail 不能正常加载ChatGLM_onnx的参数。') - raise RuntimeError("不能正常加载ChatGLM_onnx的参数!") - - while True: - # 进入任务等待状态 - kwargs = self.child.recv() - # 收到消息,开始请求 - try: - # Use the ChatGLMModel and ChatGLMTokenizer to generate a response - response = tuple(self.ChatGLM_onnx_model.generate_iterate(kwargs['query'])) - - # Send the output data - self.child.send(response[-1]) - except: - from toolbox import trimmed_format_exc - self.child.send('[Local Message] Call ChatGLM_onnx fail.' + '\n```\n' + trimmed_format_exc() + '\n```\n') - # 请求处理结束,开始下一个循环 - self.child.send('[Finish]') - - - - def stream_chat(self, **kwargs): - # 主进程执行 - self.threadLock.acquire() - self.parent.send(kwargs) - while True: - res = self.parent.recv() - if res != '[Finish]': - yield res - else: - break - self.threadLock.release() - - -################################################################################# -class ChatGLMModel(): - - def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: - self.tokenizer = ChatGLMTokenizer(tokenizer_path) - options = SessionOptions() - options.enable_profiling = profile - self.session = InferenceSession(onnx_model_path, options, providers=providers) - self.eop_token_id = self.tokenizer[""] - # input & output names - self.past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] - self.present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] - self.output_names = ["logits"] + self.present_names - - # default kv_cache for first inference - self.default_past_key_values = { - k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in self.past_names - } - - def prepare_input(self, prompt: str): - input_ids, prefix_mask = self.tokenizer.encode(prompt) - - input_ids = np.array([input_ids], dtype=np.longlong) - prefix_mask = np.array([prefix_mask], dtype=np.longlong) - - return input_ids, prefix_mask, self.default_past_key_values - - - def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): - # softmax with temperature - exp_logits = np.exp(logits / temperature) - probs = exp_logits / np.sum(exp_logits) - - # top k - top_k_idx = np.argsort(-probs)[:top_k] - top_k_probs = probs[top_k_idx] - - # top p - 
cumsum_probs = np.cumsum(top_k_probs) - top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 - top_k_probs = top_k_probs / np.sum(top_k_probs) - - # sample - next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) - return next_token[0].item() - - - def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): - input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) - output_tokens = [] - - while True: - inputs = { - "input_ids": input_ids, - "prefix_mask": prefix_mask, - "use_past": np.array(len(output_tokens) > 0), - } - inputs.update(past_key_values) - - logits, *past_key_values = self.session.run(self.output_names, inputs) - past_key_values = { k: v for k, v in zip(self.past_names, past_key_values) } - - next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) - - output_tokens += [next_token] - - if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: - break - - input_ids = np.array([[next_token]], dtype=np.longlong) - prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) - - yield process_response(self.tokenizer.decode(output_tokens)) - - return process_response(self.tokenizer.decode(output_tokens)) - -class ChatGLMTokenizer: - def __init__(self, vocab_file): - assert vocab_file is not None - self.vocab_file = vocab_file - self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""] - self.text_tokenizer = SentencePieceProcessor(str(vocab_file)) - - def __len__(self): - return len(self.text_tokenizer) - - def __getitem__(self, key: str): - return self.text_tokenizer[key] - - - def preprocess(self, text: str, linebreak=True, whitespaces=True): - if linebreak: - text = text.replace("\\n", "") - if whitespaces: - text = text.replace("\\t", "<|tab|>") - text = re.sub(r" {2,80}", self.replace_spaces_with_blank, text) - return text - - - def encode( - self, text: str, text_pair: str = None, - linebreak=True, whitespaces=True, - add_dummy_prefix=True, special_tokens=True, - ) -> tuple[list[int], list[int]]: - """ - text: Text to encode. Bidirectional part with a [gMASK] and an for causal LM. - text_pair: causal LM part. - linebreak: Whether to encode newline (\n) in text. - whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. - special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. - add_dummy_prefix: Whether to add dummy blank space in the beginning. 
- """ - text = self.preprocess(text, linebreak, whitespaces) - if not add_dummy_prefix: - text = "" + text - - tokens = self.text_tokenizer.encode(text) - prefix_mask = [1] * len(tokens) - if special_tokens: - tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] - prefix_mask += [1, 0] - - if text_pair is not None: - text_pair = self.preprocess(text_pair, linebreak, whitespaces) - pair_tokens = self.text_tokenizer.encode(text_pair) - tokens += pair_tokens - prefix_mask += [0] * len(pair_tokens) - if special_tokens: - tokens += [self.text_tokenizer[""]] - prefix_mask += [0] - - return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask - - - def decode(self, text_ids: list[int]) -> str: - text = self.text_tokenizer.decode(text_ids) - text = text.replace("", "\n") - text = text.replace("<|tab|>", "\t") - text = re.sub(r"<\|blank_(\d\d?)\|>", self.replace_blank_with_spaces, text) - return text - def replace_spaces_with_blank(match: re.Match[str]): - return f"<|blank_{len(match.group())}|>" - - def replace_blank_with_spaces(match: re.Match[str]): - return " " * int(match.group(1)) - -################################################################################# - - -def chat_template(history: list[tuple[str, str]], current: str): - prompt = "" - chat_round = 0 - for question, answer in history: - prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" - chat_round += 1 - prompt += f"[Round {chat_round}]\n问:{current}\n答:" - return prompt - -def process_response(response: str): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) - return response - -################################################################################# - - -def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): - """ - 多线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - if glm_onnx_handle is None: - glm_onnx_handle = GetGLMHandle() - if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + glm_onnx_handle.info - if not glm_onnx_handle.success: - error = glm_onnx_handle.info - glm_onnx_handle = None - raise RuntimeError(error) - - # ChatGLM_onnx doesn't have a sys_prompt interface, so add the prompt to history - history_feedin = [] - history_feedin.append(["What can I do?", sys_prompt]) - for i in range(len(history) // 2): - history_feedin.append([history[2 * i], history[2 * i + 1]]) - - watch_dog_patience = 5 # Watchdog patience, set to 5 seconds - response = "" - for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): - print(response) - if len(observe_window) >= 1: - observe_window[0] = response - if len(observe_window) >= 2: - if (time.time() - observe_window[1]) > watch_dog_patience: - raise RuntimeError("程序终止。") - return response - -def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream=True, additional_fn=None): - """ - 单线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - chatbot.append((inputs, "")) - - global glm_onnx_handle - if glm_onnx_handle is None: - glm_onnx_handle = GetGLMHandle() - chatbot[-1] = (inputs, load_message + "\n\n" + glm_onnx_handle.info) - yield from update_ui(chatbot=chatbot, history=[]) - if not 
glm_onnx_handle.success: - glm_onnx_handle = None - return - - if additional_fn is not None: - import core_functional - importlib.reload(core_functional) # Hot-reload prompt - core_functional = core_functional.get_core_functions() - if "PreProcess" in core_functional[additional_fn]: - inputs = core_functional[additional_fn]["PreProcess"](inputs) - inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"] - - history_feedin = [] - history_feedin.append(["What can I do?", system_prompt]) - for i in range(len(history) // 2): - history_feedin.append([history[2 * i], history[2 * i + 1]]) - - response = "[Local Message]: 等待ChatGLM_onnx响应中 ..." - for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): - chatbot[-1] = (inputs, response) - yield from update_ui(chatbot=chatbot, history=history) - - if response == "[Local Message]: 等待ChatGLM_onnx响应中 ...": - response = "[Local Message]: ChatGLM_onnx响应异常 ..." - history.extend([inputs, response]) - yield from update_ui(chatbot=chatbot, history=history) - - - - diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py new file mode 100644 index 0000000..4d9844a --- /dev/null +++ b/request_llm/bridge_chatglmonnx.py @@ -0,0 +1,308 @@ +model_name = "ChatGLM-ONNX" +cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`" + + +from transformers import AutoModel, AutoTokenizer +import time +import threading +import importlib +from toolbox import update_ui, get_conf +from multiprocessing import Process, Pipe +from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM + + + + + + + + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py +# ------------------------------------------------------------------------------------------------------------------------ +import re +import numpy as np +# import torch +from onnxruntime import InferenceSession, SessionOptions + + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, +# although they are documented as supported on CUDA. 
+providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +# Default paths +tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" +onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + + +# input & output names +past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] +present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] +output_names = ["logits"] + present_names + + +# default kv_cache for first inference +default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names +} + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(output_names, inputs) + past_key_values = { k: v for k, v in zip(past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) 
+ + return process_response(self.tokenizer.decode(output_tokens)) + + + + + + + + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py +# ------------------------------------------------------------------------------------------------------------------------ + +import re +from sentencepiece import SentencePieceProcessor + + +def replace_spaces_with_blank(match: re.Match[str]): + return f"<|blank_{len(match.group())}|>" + + +def replace_blank_with_spaces(match: re.Match[str]): + return " " * int(match.group(1)) + + +class ChatGLMTokenizer: + def __init__(self, vocab_file): + assert vocab_file is not None + self.vocab_file = vocab_file + self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""] + self.text_tokenizer = SentencePieceProcessor(str(vocab_file)) + + def __len__(self): + return len(self.text_tokenizer) + + def __getitem__(self, key: str): + return self.text_tokenizer[key] + + + def preprocess(self, text: str, linebreak=True, whitespaces=True): + if linebreak: + text = text.replace("\n", "") + if whitespaces: + text = text.replace("\t", "<|tab|>") + text = re.sub(r" {2,80}", replace_spaces_with_blank, text) + return text + + + def encode( + self, text: str, text_pair: str = None, + linebreak=True, whitespaces=True, + add_dummy_prefix=True, special_tokens=True, + ) -> tuple[list[int], list[int]]: + """ + text: Text to encode. Bidirectional part with a [gMASK] and an for causal LM. + text_pair: causal LM part. + linebreak: Whether to encode newline (\n) in text. + whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. + special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. + add_dummy_prefix: Whether to add dummy blank space in the beginning. 
+ """ + text = self.preprocess(text, linebreak, whitespaces) + if not add_dummy_prefix: + text = "" + text + + tokens = self.text_tokenizer.encode(text) + prefix_mask = [1] * len(tokens) + if special_tokens: + tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] + prefix_mask += [1, 0] + + if text_pair is not None: + text_pair = self.preprocess(text_pair, linebreak, whitespaces) + pair_tokens = self.text_tokenizer.encode(text_pair) + tokens += pair_tokens + prefix_mask += [0] * len(pair_tokens) + if special_tokens: + tokens += [self.text_tokenizer[""]] + prefix_mask += [0] + + return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask + + + def decode(self, text_ids: list[int]) -> str: + text = self.text_tokenizer.decode(text_ids) + text = text.replace("", "\n") + text = text.replace("<|tab|>", "\t") + text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) + return text + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Local Model +# ------------------------------------------------------------------------------------------------------------------------ +@SingletonLocalLLM +class GetONNXGLMHandle(LocalLLMHandle): + + def load_model_info(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + self.model_name = model_name + self.cmd_to_install = cmd_to_install + + def load_model_and_tokenizer(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + import os, glob + if not len(glob.glob("./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/*.bin")) >= 7: # 该模型有七个 bin 文件 + from huggingface_hub import snapshot_download + snapshot_download(repo_id="K024/ChatGLM-6b-onnx-u8s8", local_dir="./request_llm/ChatGLM-6b-onnx-u8s8") + def create_model(): + return ChatGLMModel( + tokenizer_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model", + onnx_model_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + ) + self._model = create_model() + return self._model, None + + def llm_stream_generator(self, **kwargs): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + def adaptor(kwargs): + model = self._model + tokenizer = self._tokenizer + prompt = kwargs['query'] + max_length = kwargs['max_length'] + top_p = kwargs['top_p'] + temperature = kwargs['temperature'] + history = kwargs['history'] + real_prompt = combine_history(prompt, history) + return model, tokenizer, real_prompt, max_length, top_p, temperature + + model, tokenizer, prompt, max_length, top_p, temperature = adaptor(kwargs) + + prompt = chat_template(history, question) + for answer in self._model.generate_iterate( + prompt, + max_generated_tokens=max_length, + top_k=1, + top_p=top_p, + temperature=temperature, + ): + yield answer + + def try_to_import_special_deps(self, **kwargs): + # import something that will raise error if the user does not install requirement_*.txt + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + pass + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 GPT-Academic Interface +# ------------------------------------------------------------------------------------------------------------------------ +predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name) \ No newline at end of file diff --git a/request_llm/bridge_internlm.py b/request_llm/bridge_internlm.py index a0ba3ba..804edc8 100644 --- a/request_llm/bridge_internlm.py +++ b/request_llm/bridge_internlm.py @@ -1,23 +1,25 @@ +model_name = 
"InternLM" +cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`" from transformers import AutoModel, AutoTokenizer import time import threading import importlib -from toolbox import update_ui, get_conf, Singleton +from toolbox import update_ui, get_conf from multiprocessing import Process, Pipe +from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM -model_name = "InternLM" -cmd_to_install = "`pip install ???`" -load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……" + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Local Model Utils +# ------------------------------------------------------------------------------------------------------------------------ def try_to_import_special_deps(): import sentencepiece -user_prompt = "<|User|>:{user}\n" -robot_prompt = "<|Bot|>:{robot}\n" -cur_query_prompt = "<|User|>:{user}\n<|Bot|>:" - - def combine_history(prompt, hist): + user_prompt = "<|User|>:{user}\n" + robot_prompt = "<|Bot|>:{robot}\n" + cur_query_prompt = "<|User|>:{user}\n<|Bot|>:" messages = hist total_prompt = "" for message in messages: @@ -29,24 +31,22 @@ def combine_history(prompt, hist): total_prompt = total_prompt + cur_query_prompt.replace("{user}", prompt) return total_prompt +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Local Model +# ------------------------------------------------------------------------------------------------------------------------ +@SingletonLocalLLM +class GetInternlmHandle(LocalLLMHandle): -@Singleton -class GetInternlmHandle(Process): - def __init__(self): - # ⭐主进程执行 - super().__init__(daemon=True) - self.parent, self.child = Pipe() - self._model = None - self._tokenizer = None - self.info = "" - self.success = True - self.check_dependency() - self.start() - self.threadLock = threading.Lock() + def load_model_info(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + self.model_name = model_name + self.cmd_to_install = cmd_to_install - def ready(self): - # ⭐主进程执行 - return self._model is not None + def try_to_import_special_deps(self, **kwargs): + """ + import something that will raise error if the user does not install requirement_*.txt + """ + import sentencepiece def load_model_and_tokenizer(self): # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 @@ -195,118 +195,8 @@ class GetInternlmHandle(Process): if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): return - - - def check_dependency(self): - # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 - try: - try_to_import_special_deps() - self.info = "依赖检测通过" - self.success = True - except: - self.info = f"缺少{model_name}的依赖,如果要使用{model_name},除了基础的pip依赖以外,您还需要运行{cmd_to_install}安装{model_name}的依赖。" - self.success = False - - def run(self): - # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 - # 第一次运行,加载参数 - try: - self._model, self._tokenizer = self.load_model_and_tokenizer() - except: - from toolbox import trimmed_format_exc - self.child.send(f'[Local Message] 不能正常加载{model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n') - raise RuntimeError(f"不能正常加载{model_name}的参数!") - - while True: - # 进入任务等待状态 - kwargs = self.child.recv() - # 收到消息,开始请求 - try: - for response_full in self.llm_stream_generator(**kwargs): - self.child.send(response_full) - except: - from toolbox import trimmed_format_exc - self.child.send(f'[Local Message] 调用{model_name}失败.' 
+ '\n```\n' + trimmed_format_exc() + '\n```\n') - # 请求处理结束,开始下一个循环 - self.child.send('[Finish]') - - def stream_chat(self, **kwargs): - # ⭐主进程执行 - self.threadLock.acquire() - self.parent.send(kwargs) - while True: - res = self.parent.recv() - if res != '[Finish]': - yield res - else: - break - self.threadLock.release() - # ------------------------------------------------------------------------------------------------------------------------ -# 🔌💻 GPT-Academic +# 🔌💻 GPT-Academic Interface # ------------------------------------------------------------------------------------------------------------------------ -def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): - """ - ⭐多线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - _llm_handle = GetInternlmHandle() - if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info - if not _llm_handle.success: - error = _llm_handle.info - _llm_handle = None - raise RuntimeError(error) - - # chatglm 没有 sys_prompt 接口,因此把prompt加入 history - history_feedin = [] - history_feedin.append(["What can I do?", sys_prompt]) - for i in range(len(history)//2): - history_feedin.append([history[2*i], history[2*i+1]] ) - - watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可 - response = "" - for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): - if len(observe_window) >= 1: observe_window[0] = response - if len(observe_window) >= 2: - if (time.time()-observe_window[1]) > watch_dog_patience: - raise RuntimeError("程序终止。") - return response - - - -def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None): - """ - ⭐单线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - chatbot.append((inputs, "")) - - _llm_handle = GetInternlmHandle() - chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info) - yield from update_ui(chatbot=chatbot, history=[]) - if not _llm_handle.success: - _llm_handle = None - return - - if additional_fn is not None: - from core_functional import handle_core_functionality - inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot) - - # 处理历史信息 - history_feedin = [] - history_feedin.append(["What can I do?", system_prompt] ) - for i in range(len(history)//2): - history_feedin.append([history[2*i], history[2*i+1]] ) - - # 开始接收chatglm的回复 - response = f"[Local Message]: 等待{model_name}响应中 ..." - for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): - chatbot[-1] = (inputs, response) - yield from update_ui(chatbot=chatbot, history=history) - - # 总结输出 - if response == f"[Local Message]: 等待{model_name}响应中 ...": - response = f"[Local Message]: {model_name}响应异常 ..." 
- history.extend([inputs, response]) - yield from update_ui(chatbot=chatbot, history=history) +predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetInternlmHandle, model_name) \ No newline at end of file diff --git a/request_llm/local_llm_class.py b/request_llm/local_llm_class.py new file mode 100644 index 0000000..1470717 --- /dev/null +++ b/request_llm/local_llm_class.py @@ -0,0 +1,178 @@ +from transformers import AutoModel, AutoTokenizer +import time +import threading +import importlib +from toolbox import update_ui, get_conf, Singleton +from multiprocessing import Process, Pipe + +def SingletonLocalLLM(cls): + """ + 一个单实例装饰器 + """ + _instance = {} + def _singleton(*args, **kargs): + if cls not in _instance: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + elif _instance[cls].corrupted: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + else: + return _instance[cls] + return _singleton + +class LocalLLMHandle(Process): + def __init__(self): + # ⭐主进程执行 + super().__init__(daemon=True) + self.corrupted = False + self.load_model_info() + self.parent, self.child = Pipe() + self.running = True + self._model = None + self._tokenizer = None + self.info = "" + self.check_dependency() + self.start() + self.threadLock = threading.Lock() + + def load_model_info(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + raise NotImplementedError("Method not implemented yet") + self.model_name = "" + self.cmd_to_install = "" + + def load_model_and_tokenizer(self): + """ + This function should return the model and the tokenizer + """ + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + raise NotImplementedError("Method not implemented yet") + + def llm_stream_generator(self, **kwargs): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + raise NotImplementedError("Method not implemented yet") + + def try_to_import_special_deps(self, **kwargs): + """ + import something that will raise error if the user does not install requirement_*.txt + """ + # ⭐主进程执行 + raise NotImplementedError("Method not implemented yet") + + def check_dependency(self): + # ⭐主进程执行 + try: + self.try_to_import_special_deps() + self.info = "依赖检测通过" + self.running = True + except: + self.info = f"缺少{self.model_name}的依赖,如果要使用{self.model_name},除了基础的pip依赖以外,您还需要运行{self.cmd_to_install}安装{self.model_name}的依赖。" + self.running = False + + def run(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + # 第一次运行,加载参数 + try: + self._model, self._tokenizer = self.load_model_and_tokenizer() + except: + self.running = False + from toolbox import trimmed_format_exc + self.child.send(f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n') + self.child.send('[FinishBad]') + raise RuntimeError(f"不能正常加载{self.model_name}的参数!") + + while True: + # 进入任务等待状态 + kwargs = self.child.recv() + # 收到消息,开始请求 + try: + for response_full in self.llm_stream_generator(**kwargs): + self.child.send(response_full) + self.child.send('[Finish]') + # 请求处理结束,开始下一个循环 + except: + from toolbox import trimmed_format_exc + self.child.send(f'[Local Message] 调用{self.model_name}失败.' 
+ '\n```\n' + trimmed_format_exc() + '\n```\n') + self.child.send('[Finish]') + + def stream_chat(self, **kwargs): + # ⭐主进程执行 + self.threadLock.acquire() + self.parent.send(kwargs) + while True: + res = self.parent.recv() + if res == '[Finish]': + break + if res == '[FinishBad]': + self.running = False + self.corrupted = True + break + else: + yield res + self.threadLock.release() + + + +def get_local_llm_predict_fns(LLMSingletonClass, model_name): + load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……" + + def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): + """ + ⭐多线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + _llm_handle = LLMSingletonClass() + if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info + + # chatglm 没有 sys_prompt 接口,因此把prompt加入 history + history_feedin = [] + history_feedin.append(["What can I do?", sys_prompt]) + for i in range(len(history)//2): + history_feedin.append([history[2*i], history[2*i+1]] ) + + watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可 + response = "" + for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): + if len(observe_window) >= 1: + observe_window[0] = response + if len(observe_window) >= 2: + if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。") + return response + + + + def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None): + """ + ⭐单线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + chatbot.append((inputs, "")) + + _llm_handle = LLMSingletonClass() + chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info) + yield from update_ui(chatbot=chatbot, history=[]) + + if additional_fn is not None: + from core_functional import handle_core_functionality + inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot) + + # 处理历史信息 + history_feedin = [] + history_feedin.append(["What can I do?", system_prompt] ) + for i in range(len(history)//2): + history_feedin.append([history[2*i], history[2*i+1]] ) + + # 开始接收回复 + response = f"[Local Message]: 等待{model_name}响应中 ..." + for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): + chatbot[-1] = (inputs, response) + yield from update_ui(chatbot=chatbot, history=history) + + # 总结输出 + if response == f"[Local Message]: 等待{model_name}响应中 ...": + response = f"[Local Message]: {model_name}响应异常 ..." 
+ history.extend([inputs, response]) + yield from update_ui(chatbot=chatbot, history=history) + + return predict_no_ui_long_connection, predict \ No newline at end of file diff --git a/request_llm/requirements_chatglm.txt b/request_llm/requirements_chatglm.txt index fa049ca..b2629f8 100644 --- a/request_llm/requirements_chatglm.txt +++ b/request_llm/requirements_chatglm.txt @@ -1,5 +1,5 @@ protobuf -transformers==4.27.1 +transformers>=4.27.1 cpm_kernels torch>=1.10 mdtex2html diff --git a/request_llm/requirements_chatglm_onnx.txt b/request_llm/requirements_chatglm_onnx.txt index de072bd..70ab668 100644 --- a/request_llm/requirements_chatglm_onnx.txt +++ b/request_llm/requirements_chatglm_onnx.txt @@ -1,5 +1,5 @@ protobuf -transformers==4.27.1 +transformers>=4.27.1 cpm_kernels torch>=1.10 mdtex2html From 57d4541d4ee8f380358f7b1aab878eb39fa8b3c5 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 7 Aug 2023 01:07:55 +0800 Subject: [PATCH 3/4] fix minor bug in chatglm-onnx --- request_llm/bridge_chatglmonnx.py | 245 +----------------------------- request_llm/chatglmoonx.py | 229 ++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 240 deletions(-) create mode 100644 request_llm/chatglmoonx.py diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py index 4d9844a..cde802a 100644 --- a/request_llm/bridge_chatglmonnx.py +++ b/request_llm/bridge_chatglmonnx.py @@ -10,239 +10,7 @@ from toolbox import update_ui, get_conf from multiprocessing import Process, Pipe from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM - - - - - - - - - - - - - -# ------------------------------------------------------------------------------------------------------------------------ -# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py -# ------------------------------------------------------------------------------------------------------------------------ -import re -import numpy as np -# import torch -from onnxruntime import InferenceSession, SessionOptions - - -# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, -# although they are documented as supported on CUDA. 
-providers = ["CPUExecutionProvider"] - -# if torch.cuda.is_available(): -# providers = ["CUDAExecutionProvider"] + providers - - -# Default paths -tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" -onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" - - -# input & output names -past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] -present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] -output_names = ["logits"] + present_names - - -# default kv_cache for first inference -default_past_key_values = { - k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names -} - - -def chat_template(history: list[tuple[str, str]], current: str): - prompt = "" - chat_round = 0 - for question, answer in history: - prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" - chat_round += 1 - prompt += f"[Round {chat_round}]\n问:{current}\n答:" - return prompt - - -def process_response(response: str): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) - return response - - -class ChatGLMModel(): - - def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: - self.tokenizer = ChatGLMTokenizer(tokenizer_path) - options = SessionOptions() - options.enable_profiling = profile - self.session = InferenceSession(onnx_model_path, options, providers=providers) - self.eop_token_id = self.tokenizer[""] - - - def prepare_input(self, prompt: str): - input_ids, prefix_mask = self.tokenizer.encode(prompt) - - input_ids = np.array([input_ids], dtype=np.longlong) - prefix_mask = np.array([prefix_mask], dtype=np.longlong) - - return input_ids, prefix_mask, default_past_key_values - - - def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): - # softmax with temperature - exp_logits = np.exp(logits / temperature) - probs = exp_logits / np.sum(exp_logits) - - # top k - top_k_idx = np.argsort(-probs)[:top_k] - top_k_probs = probs[top_k_idx] - - # top p - cumsum_probs = np.cumsum(top_k_probs) - top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 - top_k_probs = top_k_probs / np.sum(top_k_probs) - - # sample - next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) - return next_token[0].item() - - - def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): - input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) - output_tokens = [] - - while True: - inputs = { - "input_ids": input_ids, - "prefix_mask": prefix_mask, - "use_past": np.array(len(output_tokens) > 0), - } - inputs.update(past_key_values) - - logits, *past_key_values = self.session.run(output_names, inputs) - past_key_values = { k: v for k, v in zip(past_names, past_key_values) } - - next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) - - output_tokens += [next_token] - - if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: - break - - input_ids = np.array([[next_token]], dtype=np.longlong) - prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) - - yield process_response(self.tokenizer.decode(output_tokens)) 
-
-        return process_response(self.tokenizer.decode(output_tokens))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
-# ------------------------------------------------------------------------------------------------------------------------
-
-import re
-from sentencepiece import SentencePieceProcessor
-
-
-def replace_spaces_with_blank(match: re.Match[str]):
-    return f"<|blank_{len(match.group())}|>"
-
-
-def replace_blank_with_spaces(match: re.Match[str]):
-    return " " * int(match.group(1))
-
-
-class ChatGLMTokenizer:
-    def __init__(self, vocab_file):
-        assert vocab_file is not None
-        self.vocab_file = vocab_file
-        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
-        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
-
-    def __len__(self):
-        return len(self.text_tokenizer)
-
-    def __getitem__(self, key: str):
-        return self.text_tokenizer[key]
-
-
-    def preprocess(self, text: str, linebreak=True, whitespaces=True):
-        if linebreak:
-            text = text.replace("\n", "<n>")
-        if whitespaces:
-            text = text.replace("\t", "<|tab|>")
-            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
-        return text
-
-
-    def encode(
-        self, text: str, text_pair: str = None,
-        linebreak=True, whitespaces=True,
-        add_dummy_prefix=True, special_tokens=True,
-    ) -> tuple[list[int], list[int]]:
-        """
-        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
-        text_pair: causal LM part.
-        linebreak: Whether to encode newline (\n) in text.
-        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
-        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
-        add_dummy_prefix: Whether to add dummy blank space in the beginning.
- """ - text = self.preprocess(text, linebreak, whitespaces) - if not add_dummy_prefix: - text = "" + text - - tokens = self.text_tokenizer.encode(text) - prefix_mask = [1] * len(tokens) - if special_tokens: - tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] - prefix_mask += [1, 0] - - if text_pair is not None: - text_pair = self.preprocess(text_pair, linebreak, whitespaces) - pair_tokens = self.text_tokenizer.encode(text_pair) - tokens += pair_tokens - prefix_mask += [0] * len(pair_tokens) - if special_tokens: - tokens += [self.text_tokenizer[""]] - prefix_mask += [0] - - return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask - - - def decode(self, text_ids: list[int]) -> str: - text = self.text_tokenizer.decode(text_ids) - text = text.replace("", "\n") - text = text.replace("<|tab|>", "\t") - text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) - return text +from .chatglmoonx import ChatGLMModel, chat_template @@ -274,19 +42,16 @@ class GetONNXGLMHandle(LocalLLMHandle): def llm_stream_generator(self, **kwargs): # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 def adaptor(kwargs): - model = self._model - tokenizer = self._tokenizer - prompt = kwargs['query'] + query = kwargs['query'] max_length = kwargs['max_length'] top_p = kwargs['top_p'] temperature = kwargs['temperature'] history = kwargs['history'] - real_prompt = combine_history(prompt, history) - return model, tokenizer, real_prompt, max_length, top_p, temperature + return query, max_length, top_p, temperature, history - model, tokenizer, prompt, max_length, top_p, temperature = adaptor(kwargs) + query, max_length, top_p, temperature, history = adaptor(kwargs) - prompt = chat_template(history, question) + prompt = chat_template(history, query) for answer in self._model.generate_iterate( prompt, max_generated_tokens=max_length, diff --git a/request_llm/chatglmoonx.py b/request_llm/chatglmoonx.py new file mode 100644 index 0000000..444181e --- /dev/null +++ b/request_llm/chatglmoonx.py @@ -0,0 +1,229 @@ + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py +# ------------------------------------------------------------------------------------------------------------------------ +import re +import numpy as np +# import torch +from onnxruntime import InferenceSession, SessionOptions + + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, +# although they are documented as supported on CUDA. 
+providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +# Default paths +tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" +onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + + +# input & output names +past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] +present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] +output_names = ["logits"] + present_names + + +# default kv_cache for first inference +default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names +} + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(output_names, inputs) + past_key_values = { k: v for k, v in zip(past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) 
+
+        return process_response(self.tokenizer.decode(output_tokens))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
+# ------------------------------------------------------------------------------------------------------------------------
+
+import re
+from sentencepiece import SentencePieceProcessor
+
+
+def replace_spaces_with_blank(match: re.Match[str]):
+    return f"<|blank_{len(match.group())}|>"
+
+
+def replace_blank_with_spaces(match: re.Match[str]):
+    return " " * int(match.group(1))
+
+
+class ChatGLMTokenizer:
+    def __init__(self, vocab_file):
+        assert vocab_file is not None
+        self.vocab_file = vocab_file
+        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
+        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
+
+    def __len__(self):
+        return len(self.text_tokenizer)
+
+    def __getitem__(self, key: str):
+        return self.text_tokenizer[key]
+
+
+    def preprocess(self, text: str, linebreak=True, whitespaces=True):
+        if linebreak:
+            text = text.replace("\n", "<n>")
+        if whitespaces:
+            text = text.replace("\t", "<|tab|>")
+            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
+        return text
+
+
+    def encode(
+        self, text: str, text_pair: str = None,
+        linebreak=True, whitespaces=True,
+        add_dummy_prefix=True, special_tokens=True,
+    ) -> tuple[list[int], list[int]]:
+        """
+        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
+        text_pair: causal LM part.
+        linebreak: Whether to encode newline (\n) in text.
+        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+        add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """ + text = self.preprocess(text, linebreak, whitespaces) + if not add_dummy_prefix: + text = "" + text + + tokens = self.text_tokenizer.encode(text) + prefix_mask = [1] * len(tokens) + if special_tokens: + tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] + prefix_mask += [1, 0] + + if text_pair is not None: + text_pair = self.preprocess(text_pair, linebreak, whitespaces) + pair_tokens = self.text_tokenizer.encode(text_pair) + tokens += pair_tokens + prefix_mask += [0] * len(pair_tokens) + if special_tokens: + tokens += [self.text_tokenizer[""]] + prefix_mask += [0] + + return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask + + + def decode(self, text_ids: list[int]) -> str: + text = self.text_tokenizer.decode(text_ids) + text = text.replace("", "\n") + text = text.replace("<|tab|>", "\t") + text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) + return text + + From 0a37106692f01bcedcf19f4c2aea1dc8b954faea Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 7 Aug 2023 01:11:44 +0800 Subject: [PATCH 4/4] reverse cmd_to_install --- config.py | 2 +- request_llm/bridge_chatglmonnx.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config.py b/config.py index c12d718..1d43dd4 100644 --- a/config.py +++ b/config.py @@ -70,7 +70,7 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "internlm", "moss", "newbing", "stack-claude"] +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"] # P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py index cde802a..fbe64b4 100644 --- a/request_llm/bridge_chatglmonnx.py +++ b/request_llm/bridge_chatglmonnx.py @@ -1,5 +1,5 @@ model_name = "ChatGLM-ONNX" -cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`" +cmd_to_install = "`pip install request_llm/requirements_chatglm_onnx.txt`" from transformers import AutoModel, AutoTokenizer