From 57d4541d4ee8f380358f7b1aab878eb39fa8b3c5 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Mon, 7 Aug 2023 01:07:55 +0800
Subject: [PATCH] fix minor bug in chatglm-onnx

---
 request_llm/bridge_chatglmonnx.py | 245 +-----------------------------
 request_llm/chatglmoonx.py        | 229 ++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+), 240 deletions(-)
 create mode 100644 request_llm/chatglmoonx.py

diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py
index 4d9844a..cde802a 100644
--- a/request_llm/bridge_chatglmonnx.py
+++ b/request_llm/bridge_chatglmonnx.py
@@ -10,239 +10,7 @@ from toolbox import update_ui, get_conf
 from multiprocessing import Process, Pipe
 from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py
-# ------------------------------------------------------------------------------------------------------------------------
-import re
-import numpy as np
-# import torch
-from onnxruntime import InferenceSession, SessionOptions
-
-
-# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU,
-# although they are documented as supported on CUDA.
-providers = ["CPUExecutionProvider"]
-
-# if torch.cuda.is_available():
-#     providers = ["CUDAExecutionProvider"] + providers
-
-
-# Default paths
-tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model"
-onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"
-
-
-# input & output names
-past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]]
-present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]]
-output_names = ["logits"] + present_names
-
-
-# default kv_cache for first inference
-default_past_key_values = {
-    k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names
-}
-
-
-def chat_template(history: list[tuple[str, str]], current: str):
-    prompt = ""
-    chat_round = 0
-    for question, answer in history:
-        prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n"
-        chat_round += 1
-    prompt += f"[Round {chat_round}]\n问:{current}\n答:"
-    return prompt
-
-
-def process_response(response: str):
-    response = response.strip()
-    response = response.replace("[[训练时间]]", "2023年")
-    punkts = [
-        [",", ","],
-        ["!", "!"],
-        [":", ":"],
-        [";", ";"],
-        ["\?", "?"],
-    ]
-    for item in punkts:
-        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
-        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
-    return response
-
-
-class ChatGLMModel():
-
-    def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None:
-        self.tokenizer = ChatGLMTokenizer(tokenizer_path)
-        options = SessionOptions()
-        options.enable_profiling = profile
-        self.session = InferenceSession(onnx_model_path, options, providers=providers)
-        self.eop_token_id = self.tokenizer["<eop>"]
-
-
-    def prepare_input(self, prompt: str):
-        input_ids, prefix_mask = self.tokenizer.encode(prompt)
-
-        input_ids = np.array([input_ids], dtype=np.longlong)
-        prefix_mask = np.array([prefix_mask], dtype=np.longlong)
-
-        return input_ids, prefix_mask, default_past_key_values
-
-
-    def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1):
-        # softmax with temperature
-        exp_logits = np.exp(logits / temperature)
-        probs = exp_logits / np.sum(exp_logits)
-
-        # top k
-        top_k_idx = np.argsort(-probs)[:top_k]
-        top_k_probs = probs[top_k_idx]
-
-        # top p
-        cumsum_probs = np.cumsum(top_k_probs)
-        top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0
-        top_k_probs = top_k_probs / np.sum(top_k_probs)
-
-        # sample
-        next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs)
-        return next_token[0].item()
-
-
-    def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1):
-        input_ids, prefix_mask, past_key_values = self.prepare_input(prompt)
-        output_tokens = []
-
-        while True:
-            inputs = {
-                "input_ids": input_ids,
-                "prefix_mask": prefix_mask,
-                "use_past": np.array(len(output_tokens) > 0),
-            }
-            inputs.update(past_key_values)
-
-            logits, *past_key_values = self.session.run(output_names, inputs)
-            past_key_values = { k: v for k, v in zip(past_names, past_key_values) }
-
-            next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature)
-
-            output_tokens += [next_token]
-
-            if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens:
-                break
-
-            input_ids = np.array([[next_token]], dtype=np.longlong)
-            prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1)
-
-            yield process_response(self.tokenizer.decode(output_tokens))
-
-        return process_response(self.tokenizer.decode(output_tokens))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
-# ------------------------------------------------------------------------------------------------------------------------
-
-import re
-from sentencepiece import SentencePieceProcessor
-
-
-def replace_spaces_with_blank(match: re.Match[str]):
-    return f"<|blank_{len(match.group())}|>"
-
-
-def replace_blank_with_spaces(match: re.Match[str]):
-    return " " * int(match.group(1))
-
-
-class ChatGLMTokenizer:
-    def __init__(self, vocab_file):
-        assert vocab_file is not None
-        self.vocab_file = vocab_file
-        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unk>", "<sop>", "<eop>", "<s>", "</s>"]
-        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
-
-    def __len__(self):
-        return len(self.text_tokenizer)
-
-    def __getitem__(self, key: str):
-        return self.text_tokenizer[key]
-
-
-    def preprocess(self, text: str, linebreak=True, whitespaces=True):
-        if linebreak:
-            text = text.replace("\n", "<n>")
-        if whitespaces:
-            text = text.replace("\t", "<|tab|>")
-            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
-        return text
-
-
-    def encode(
-        self, text: str, text_pair: str = None,
-        linebreak=True, whitespaces=True,
-        add_dummy_prefix=True, special_tokens=True,
-    ) -> tuple[list[int], list[int]]:
-        """
-        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
-        text_pair: causal LM part.
-        linebreak: Whether to encode newline (\n) in text.
-        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
-        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
-        add_dummy_prefix: Whether to add dummy blank space in the beginning.
- """ - text = self.preprocess(text, linebreak, whitespaces) - if not add_dummy_prefix: - text = "" + text - - tokens = self.text_tokenizer.encode(text) - prefix_mask = [1] * len(tokens) - if special_tokens: - tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] - prefix_mask += [1, 0] - - if text_pair is not None: - text_pair = self.preprocess(text_pair, linebreak, whitespaces) - pair_tokens = self.text_tokenizer.encode(text_pair) - tokens += pair_tokens - prefix_mask += [0] * len(pair_tokens) - if special_tokens: - tokens += [self.text_tokenizer[""]] - prefix_mask += [0] - - return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask - - - def decode(self, text_ids: list[int]) -> str: - text = self.text_tokenizer.decode(text_ids) - text = text.replace("", "\n") - text = text.replace("<|tab|>", "\t") - text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) - return text +from .chatglmoonx import ChatGLMModel, chat_template @@ -274,19 +42,16 @@ class GetONNXGLMHandle(LocalLLMHandle): def llm_stream_generator(self, **kwargs): # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 def adaptor(kwargs): - model = self._model - tokenizer = self._tokenizer - prompt = kwargs['query'] + query = kwargs['query'] max_length = kwargs['max_length'] top_p = kwargs['top_p'] temperature = kwargs['temperature'] history = kwargs['history'] - real_prompt = combine_history(prompt, history) - return model, tokenizer, real_prompt, max_length, top_p, temperature + return query, max_length, top_p, temperature, history - model, tokenizer, prompt, max_length, top_p, temperature = adaptor(kwargs) + query, max_length, top_p, temperature, history = adaptor(kwargs) - prompt = chat_template(history, question) + prompt = chat_template(history, query) for answer in self._model.generate_iterate( prompt, max_generated_tokens=max_length, diff --git a/request_llm/chatglmoonx.py b/request_llm/chatglmoonx.py new file mode 100644 index 0000000..444181e --- /dev/null +++ b/request_llm/chatglmoonx.py @@ -0,0 +1,229 @@ + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py +# ------------------------------------------------------------------------------------------------------------------------ +import re +import numpy as np +# import torch +from onnxruntime import InferenceSession, SessionOptions + + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, +# although they are documented as supported on CUDA. 
+providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +# Default paths +tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" +onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + + +# input & output names +past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] +present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] +output_names = ["logits"] + present_names + + +# default kv_cache for first inference +default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names +} + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(output_names, inputs) + past_key_values = { k: v for k, v in zip(past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) 
+
+        return process_response(self.tokenizer.decode(output_tokens))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
+# ------------------------------------------------------------------------------------------------------------------------
+
+import re
+from sentencepiece import SentencePieceProcessor
+
+
+def replace_spaces_with_blank(match: re.Match[str]):
+    return f"<|blank_{len(match.group())}|>"
+
+
+def replace_blank_with_spaces(match: re.Match[str]):
+    return " " * int(match.group(1))
+
+
+class ChatGLMTokenizer:
+    def __init__(self, vocab_file):
+        assert vocab_file is not None
+        self.vocab_file = vocab_file
+        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unk>", "<sop>", "<eop>", "<s>", "</s>"]
+        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
+
+    def __len__(self):
+        return len(self.text_tokenizer)
+
+    def __getitem__(self, key: str):
+        return self.text_tokenizer[key]
+
+
+    def preprocess(self, text: str, linebreak=True, whitespaces=True):
+        if linebreak:
+            text = text.replace("\n", "<n>")
+        if whitespaces:
+            text = text.replace("\t", "<|tab|>")
+            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
+        return text
+
+
+    def encode(
+        self, text: str, text_pair: str = None,
+        linebreak=True, whitespaces=True,
+        add_dummy_prefix=True, special_tokens=True,
+    ) -> tuple[list[int], list[int]]:
+        """
+        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
+        text_pair: causal LM part.
+        linebreak: Whether to encode newline (\n) in text.
+        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+        add_dummy_prefix: Whether to add dummy blank space in the beginning.
+        """
+        text = self.preprocess(text, linebreak, whitespaces)
+        if not add_dummy_prefix:
+            text = "<n>" + text
+
+        tokens = self.text_tokenizer.encode(text)
+        prefix_mask = [1] * len(tokens)
+        if special_tokens:
+            tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer["<sop>"]]
+            prefix_mask += [1, 0]
+
+        if text_pair is not None:
+            text_pair = self.preprocess(text_pair, linebreak, whitespaces)
+            pair_tokens = self.text_tokenizer.encode(text_pair)
+            tokens += pair_tokens
+            prefix_mask += [0] * len(pair_tokens)
+            if special_tokens:
+                tokens += [self.text_tokenizer["<eop>"]]
+                prefix_mask += [0]
+
+        return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask
+
+
+    def decode(self, text_ids: list[int]) -> str:
+        text = self.text_tokenizer.decode(text_ids)
+        text = text.replace("<n>", "\n")
+        text = text.replace("<|tab|>", "\t")
+        text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text)
+        return text
+
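Below the patch, a minimal usage sketch of the module it introduces (not part of the diff). It assumes the repository root is on the Python path and that the quantized ONNX weights are already present at the default paths hard-coded in chatglmoonx.py; the history strings are placeholders.

# Hypothetical standalone example of the API added by this patch.
from request_llm.chatglmoonx import ChatGLMModel, chat_template

# Uses the default paths "chatglm-6b-int8-onnx-merged/..." (assumed downloaded).
model = ChatGLMModel()

# Build the [Round N] prompt from prior (question, answer) pairs plus the new query.
history = [("你好", "你好,有什么可以帮您?")]
prompt = chat_template(history, "用一句话介绍 ONNX Runtime")

# generate_iterate yields the cumulative, post-processed response after each sampled token,
# which is how the bridge streams partial answers back to the UI.
for partial_response in model.generate_iterate(prompt, max_generated_tokens=128, top_p=0.7, temperature=1.0):
    print(partial_response)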