From 57d4541d4ee8f380358f7b1aab878eb39fa8b3c5 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Mon, 7 Aug 2023 01:07:55 +0800
Subject: [PATCH] fix minor bug in chatglm-onnx

---
 request_llm/bridge_chatglmonnx.py | 245 +-----------------------------
 request_llm/chatglmoonx.py        | 229 ++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+), 240 deletions(-)
 create mode 100644 request_llm/chatglmoonx.py

diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py
index 4d9844a..cde802a 100644
--- a/request_llm/bridge_chatglmonnx.py
+++ b/request_llm/bridge_chatglmonnx.py
@@ -10,239 +10,7 @@ from toolbox import update_ui, get_conf
 from multiprocessing import Process, Pipe
 from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py
-# ------------------------------------------------------------------------------------------------------------------------
-import re
-import numpy as np
-# import torch
-from onnxruntime import InferenceSession, SessionOptions
-
-
-# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU,
-# although they are documented as supported on CUDA.
-providers = ["CPUExecutionProvider"]
-
-# if torch.cuda.is_available():
-#     providers = ["CUDAExecutionProvider"] + providers
-
-
-# Default paths
-tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model"
-onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"
-
-
-# input & output names
-past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]]
-present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]]
-output_names = ["logits"] + present_names
-
-
-# default kv_cache for first inference
-default_past_key_values = {
-    k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names
-}
-
-
-def chat_template(history: list[tuple[str, str]], current: str):
-    prompt = ""
-    chat_round = 0
-    for question, answer in history:
-        prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n"
-        chat_round += 1
-    prompt += f"[Round {chat_round}]\n问:{current}\n答:"
-    return prompt
-
-
-def process_response(response: str):
-    response = response.strip()
-    response = response.replace("[[训练时间]]", "2023年")
-    punkts = [
-        [",", ","],
-        ["!", "!"],
-        [":", ":"],
-        [";", ";"],
-        ["\?", "?"],
-    ]
-    for item in punkts:
-        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
-        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
-    return response
-
-
-class ChatGLMModel():
-
-    def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None:
-        self.tokenizer = ChatGLMTokenizer(tokenizer_path)
-        options = SessionOptions()
-        options.enable_profiling = profile
-        self.session = InferenceSession(onnx_model_path, options, providers=providers)
-        self.eop_token_id = self.tokenizer["<eop>"]
-
-
-    def prepare_input(self, prompt: str):
-        input_ids, prefix_mask = self.tokenizer.encode(prompt)
-
-        input_ids = np.array([input_ids], dtype=np.longlong)
-        prefix_mask = np.array([prefix_mask], dtype=np.longlong)
-
-        return input_ids, prefix_mask, default_past_key_values
-
-
-    def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1):
-        # softmax with temperature
-        exp_logits = np.exp(logits / temperature)
-        probs = exp_logits / np.sum(exp_logits)
-
-        # top k
-        top_k_idx = np.argsort(-probs)[:top_k]
-        top_k_probs = probs[top_k_idx]
-
-        # top p
-        cumsum_probs = np.cumsum(top_k_probs)
-        top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0
-        top_k_probs = top_k_probs / np.sum(top_k_probs)
-
-        # sample
-        next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs)
-        return next_token[0].item()
-
-
-    def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1):
-        input_ids, prefix_mask, past_key_values = self.prepare_input(prompt)
-        output_tokens = []
-
-        while True:
-            inputs = {
-                "input_ids": input_ids,
-                "prefix_mask": prefix_mask,
-                "use_past": np.array(len(output_tokens) > 0),
-            }
-            inputs.update(past_key_values)
-
-            logits, *past_key_values = self.session.run(output_names, inputs)
-            past_key_values = { k: v for k, v in zip(past_names, past_key_values) }
-
-            next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature)
-
-            output_tokens += [next_token]
-
-            if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens:
-                break
-
-            input_ids = np.array([[next_token]], dtype=np.longlong)
-            prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1)
-
-            yield process_response(self.tokenizer.decode(output_tokens))
-
-        return process_response(self.tokenizer.decode(output_tokens))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
-# ------------------------------------------------------------------------------------------------------------------------
-
-import re
-from sentencepiece import SentencePieceProcessor
-
-
-def replace_spaces_with_blank(match: re.Match[str]):
-    return f"<|blank_{len(match.group())}|>"
-
-
-def replace_blank_with_spaces(match: re.Match[str]):
-    return " " * int(match.group(1))
-
-
-class ChatGLMTokenizer:
-    def __init__(self, vocab_file):
-        assert vocab_file is not None
-        self.vocab_file = vocab_file
-        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unk>", "<sop>", "<eop>", "<s>", "</s>"]
-        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
-
-    def __len__(self):
-        return len(self.text_tokenizer)
-
-    def __getitem__(self, key: str):
-        return self.text_tokenizer[key]
-
-
-    def preprocess(self, text: str, linebreak=True, whitespaces=True):
-        if linebreak:
-            text = text.replace("\n", "<n>")
-        if whitespaces:
-            text = text.replace("\t", "<|tab|>")
-            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
-        return text
-
-
-    def encode(
-        self, text: str, text_pair: str = None,
-        linebreak=True, whitespaces=True,
-        add_dummy_prefix=True, special_tokens=True,
-    ) -> tuple[list[int], list[int]]:
-        """
-        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
-        text_pair: causal LM part.
-        linebreak: Whether to encode newline (\n) in text.
-        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
-        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
-        add_dummy_prefix: Whether to add dummy blank space in the beginning.
- """ - text = self.preprocess(text, linebreak, whitespaces) - if not add_dummy_prefix: - text = "" + text - - tokens = self.text_tokenizer.encode(text) - prefix_mask = [1] * len(tokens) - if special_tokens: - tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] - prefix_mask += [1, 0] - - if text_pair is not None: - text_pair = self.preprocess(text_pair, linebreak, whitespaces) - pair_tokens = self.text_tokenizer.encode(text_pair) - tokens += pair_tokens - prefix_mask += [0] * len(pair_tokens) - if special_tokens: - tokens += [self.text_tokenizer[""]] - prefix_mask += [0] - - return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask - - - def decode(self, text_ids: list[int]) -> str: - text = self.text_tokenizer.decode(text_ids) - text = text.replace("", "\n") - text = text.replace("<|tab|>", "\t") - text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) - return text +from .chatglmoonx import ChatGLMModel, chat_template @@ -274,19 +42,16 @@ class GetONNXGLMHandle(LocalLLMHandle): def llm_stream_generator(self, **kwargs): # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 def adaptor(kwargs): - model = self._model - tokenizer = self._tokenizer - prompt = kwargs['query'] + query = kwargs['query'] max_length = kwargs['max_length'] top_p = kwargs['top_p'] temperature = kwargs['temperature'] history = kwargs['history'] - real_prompt = combine_history(prompt, history) - return model, tokenizer, real_prompt, max_length, top_p, temperature + return query, max_length, top_p, temperature, history - model, tokenizer, prompt, max_length, top_p, temperature = adaptor(kwargs) + query, max_length, top_p, temperature, history = adaptor(kwargs) - prompt = chat_template(history, question) + prompt = chat_template(history, query) for answer in self._model.generate_iterate( prompt, max_generated_tokens=max_length, diff --git a/request_llm/chatglmoonx.py b/request_llm/chatglmoonx.py new file mode 100644 index 0000000..444181e --- /dev/null +++ b/request_llm/chatglmoonx.py @@ -0,0 +1,229 @@ + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py +# ------------------------------------------------------------------------------------------------------------------------ +import re +import numpy as np +# import torch +from onnxruntime import InferenceSession, SessionOptions + + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, +# although they are documented as supported on CUDA. 
+providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +# Default paths +tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" +onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + + +# input & output names +past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] +present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] +output_names = ["logits"] + present_names + + +# default kv_cache for first inference +default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names +} + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(output_names, inputs) + past_key_values = { k: v for k, v in zip(past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) 
+
+        return process_response(self.tokenizer.decode(output_tokens))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
+# ------------------------------------------------------------------------------------------------------------------------
+
+import re
+from sentencepiece import SentencePieceProcessor
+
+
+def replace_spaces_with_blank(match: re.Match[str]):
+    return f"<|blank_{len(match.group())}|>"
+
+
+def replace_blank_with_spaces(match: re.Match[str]):
+    return " " * int(match.group(1))
+
+
+class ChatGLMTokenizer:
+    def __init__(self, vocab_file):
+        assert vocab_file is not None
+        self.vocab_file = vocab_file
+        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unk>", "<sop>", "<eop>", "<s>", "</s>"]
+        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
+
+    def __len__(self):
+        return len(self.text_tokenizer)
+
+    def __getitem__(self, key: str):
+        return self.text_tokenizer[key]
+
+
+    def preprocess(self, text: str, linebreak=True, whitespaces=True):
+        if linebreak:
+            text = text.replace("\n", "<n>")
+        if whitespaces:
+            text = text.replace("\t", "<|tab|>")
+            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
+        return text
+
+
+    def encode(
+        self, text: str, text_pair: str = None,
+        linebreak=True, whitespaces=True,
+        add_dummy_prefix=True, special_tokens=True,
+    ) -> tuple[list[int], list[int]]:
+        """
+        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
+        text_pair: causal LM part.
+        linebreak: Whether to encode newline (\n) in text.
+        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+        add_dummy_prefix: Whether to add dummy blank space in the beginning.
+        """
+        text = self.preprocess(text, linebreak, whitespaces)
+        if not add_dummy_prefix:
+            text = "<n>" + text
+
+        tokens = self.text_tokenizer.encode(text)
+        prefix_mask = [1] * len(tokens)
+        if special_tokens:
+            tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer["<sop>"]]
+            prefix_mask += [1, 0]
+
+        if text_pair is not None:
+            text_pair = self.preprocess(text_pair, linebreak, whitespaces)
+            pair_tokens = self.text_tokenizer.encode(text_pair)
+            tokens += pair_tokens
+            prefix_mask += [0] * len(pair_tokens)
+            if special_tokens:
+                tokens += [self.text_tokenizer["<eop>"]]
+                prefix_mask += [0]
+
+        return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask
+
+
+    def decode(self, text_ids: list[int]) -> str:
+        text = self.text_tokenizer.decode(text_ids)
+        text = text.replace("<n>", "\n")
+        text = text.replace("<|tab|>", "\t")
+        text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text)
+        return text
+
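Below the patch, a minimal usage sketch of the module it introduces (not part of the diff). It assumes the repository root is on the Python path and that the quantized ONNX weights are already present at the default paths hard-coded in chatglmoonx.py; the history strings are placeholders.

# Hypothetical standalone example of the API added by this patch.
from request_llm.chatglmoonx import ChatGLMModel, chat_template

# Uses the default paths "chatglm-6b-int8-onnx-merged/..." (assumed downloaded).
model = ChatGLMModel()

# Build the [Round N] prompt from prior (question, answer) pairs plus the new query.
history = [("你好", "你好,有什么可以帮您?")]
prompt = chat_template(history, "用一句话介绍 ONNX Runtime")

# generate_iterate yields the cumulative, post-processed response after each sampled token,
# which is how the bridge streams partial answers back to the UI.
for partial_response in model.generate_iterate(prompt, max_generated_tokens=128, top_p=0.7, temperature=1.0):
    print(partial_response)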