From c0c337988fa5c154676cdcd52a9f8b6780ce0f58 Mon Sep 17 00:00:00 2001 From: ValeriaWong <763717425@qq.com> Date: Tue, 1 Aug 2023 00:48:57 +0800 Subject: [PATCH 1/4] =?UTF-8?q?feat(chatglm=5Fint8=5Fonnx):=E7=BA=AFCPU?= =?UTF-8?q?=E6=8E=A8=E7=90=86=EF=BC=8C=E6=9C=80=E5=A4=9A=E4=BB=85=E9=9C=80?= =?UTF-8?q?8GB=E5=86=85=E5=AD=98=EF=BC=8C=E6=8E=A8=E7=90=86=E9=80=9F?= =?UTF-8?q?=E5=BA=A6=E6=9C=AA=E6=B5=8B=E8=AF=84=EF=BC=8Ctoken=E6=95=B0?= =?UTF-8?q?=E6=9C=89=E9=99=90=EF=BC=8C=E6=9A=82=E6=97=B6=E8=BF=98=E4=B8=8D?= =?UTF-8?q?=E8=83=BD=E6=B5=81=E5=BC=8F=E8=BE=93=E5=87=BA=20#1008?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 2 +- request_llm/bridge_all.py | 11 +- request_llm/bridge_chatglm_onnx.py | 354 ++++++++++++++++++++++ request_llm/requirements_chatglm_onnx.txt | 11 + 4 files changed, 376 insertions(+), 2 deletions(-) create mode 100644 request_llm/bridge_chatglm_onnx.py create mode 100644 request_llm/requirements_chatglm_onnx.txt diff --git a/config.py b/config.py index 0234bb4..659589f 100644 --- a/config.py +++ b/config.py @@ -70,7 +70,7 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"] +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm","chatglm_onnx","moss", "newbing", "stack-claude"] # P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] diff --git a/request_llm/bridge_all.py b/request_llm/bridge_all.py index c40c2e7..75e448e 100644 --- a/request_llm/bridge_all.py +++ b/request_llm/bridge_all.py @@ -19,6 +19,8 @@ from .bridge_chatgpt import predict as chatgpt_ui from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui from .bridge_chatglm import predict as chatglm_ui +from .bridge_chatglm_onnx import predict_no_ui_long_connection as chatglm_onnx_noui +from .bridge_chatglm_onnx import predict as chatglm_onnx_ui # from .bridge_tgui import predict_no_ui_long_connection as tgui_noui # from .bridge_tgui import predict as tgui_ui @@ -164,7 +166,14 @@ model_info = { "tokenizer": tokenizer_gpt35, "token_cnt": get_token_num_gpt35, }, - + "chatglm_onnx": { + "fn_with_ui": chatglm_onnx_ui, + "fn_without_ui": chatglm_onnx_noui, + "endpoint": None, + "max_token": 1024, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + }, } diff --git a/request_llm/bridge_chatglm_onnx.py b/request_llm/bridge_chatglm_onnx.py new file mode 100644 index 0000000..636b38d --- /dev/null +++ b/request_llm/bridge_chatglm_onnx.py @@ -0,0 +1,354 @@ +import re +import threading +from toolbox import update_ui, get_conf +from multiprocessing import Process, Pipe +import numpy as np +from onnxruntime import InferenceSession, SessionOptions +from sentencepiece import SentencePieceProcessor + + +# 模型来源 K024/ChatGLM-6b-onnx-u8s8 + +global glm_onnx_handle + + +glm_onnx_handle = None +load_message = "ChatGLM_onnx尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,ChatGLM_onnx消耗大量的内存(CPU)或显存(GPU),也许会导致低配(内存<8GB)计算机卡死 ……" + +# Default paths +tokenizer_path = "YOUR/TOKENIZER_PATH/sentencepiece.model" +onnx_model_path = "YOUR/TOKENIZER_PATH/chatglm-6b-int8.onnx" + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only 
supported on CPU, +# although they are documented as supported on CUDA. +providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +################################################################################# +class GetGLMHandle(Process): + + def __init__(self): + super().__init__(daemon=True) + self.parent, self.child = Pipe() + self.ChatGLM_onnx_model = None # tokenizer_path + self.ChatGLM_onnx_tokenizer = None # onnx_model_path + self.info = "" + self.success = True + self.check_dependency() + self.start() + self.threadLock = threading.Lock() + + def check_dependency(self): + try: + import sentencepiece + self.info = "依赖检测通过" + self.success = True + except: + self.info = "缺少ChatGLM_onnx的依赖,如果要使用ChatGLM_onnx,除了基础的pip依赖以外,您还需要运行`pip install -r request_llm/requirements_ChatGLM_onnx.txt`安装ChatGLM_onnx的依赖。" + self.success = False + + def ready(self): + return self.ChatGLM_onnx_model is not None + + + def run(self): + # 子进程执行 + # 第一次运行,加载参数 + retry = 0 + while True: + try: + if self.ChatGLM_onnx_model is None: + # Initialize the ChatGLMModel and ChatGLMTokenizer + self.ChatGLM_onnx_model = ChatGLMModel() + self.ChatGLM_onnx_tokenizer = ChatGLMTokenizer() + break + else: + break + except: + retry += 1 + if retry > 3: + self.child.send('[Local Message] Call ChatGLM_onnx fail 不能正常加载ChatGLM_onnx的参数。') + raise RuntimeError("不能正常加载ChatGLM_onnx的参数!") + + while True: + # 进入任务等待状态 + kwargs = self.child.recv() + # 收到消息,开始请求 + try: + # Use the ChatGLMModel and ChatGLMTokenizer to generate a response + response = tuple(self.ChatGLM_onnx_model.generate_iterate(kwargs['query'])) + + # Send the output data + self.child.send(response[-1]) + except: + from toolbox import trimmed_format_exc + self.child.send('[Local Message] Call ChatGLM_onnx fail.' 
+ '\n```\n' + trimmed_format_exc() + '\n```\n') + # 请求处理结束,开始下一个循环 + self.child.send('[Finish]') + + + + def stream_chat(self, **kwargs): + # 主进程执行 + self.threadLock.acquire() + self.parent.send(kwargs) + while True: + res = self.parent.recv() + if res != '[Finish]': + yield res + else: + break + self.threadLock.release() + + +################################################################################# +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + # input & output names + self.past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] + self.present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] + self.output_names = ["logits"] + self.present_names + + # default kv_cache for first inference + self.default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in self.past_names + } + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, self.default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(self.output_names, inputs) + past_key_values = { k: v for k, v in zip(self.past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) + + return process_response(self.tokenizer.decode(output_tokens)) + +class ChatGLMTokenizer: + def __init__(self, vocab_file): + assert vocab_file is not None + self.vocab_file = vocab_file + self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""] + self.text_tokenizer = SentencePieceProcessor(str(vocab_file)) + + def __len__(self): + return len(self.text_tokenizer) + + def __getitem__(self, key: str): + return self.text_tokenizer[key] + + + def preprocess(self, text: str, linebreak=True, whitespaces=True): + if 
linebreak: + text = text.replace("\\n", "") + if whitespaces: + text = text.replace("\\t", "<|tab|>") + text = re.sub(r" {2,80}", self.replace_spaces_with_blank, text) + return text + + + def encode( + self, text: str, text_pair: str = None, + linebreak=True, whitespaces=True, + add_dummy_prefix=True, special_tokens=True, + ) -> tuple[list[int], list[int]]: + """ + text: Text to encode. Bidirectional part with a [gMASK] and an for causal LM. + text_pair: causal LM part. + linebreak: Whether to encode newline (\n) in text. + whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. + special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. + add_dummy_prefix: Whether to add dummy blank space in the beginning. + """ + text = self.preprocess(text, linebreak, whitespaces) + if not add_dummy_prefix: + text = "" + text + + tokens = self.text_tokenizer.encode(text) + prefix_mask = [1] * len(tokens) + if special_tokens: + tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] + prefix_mask += [1, 0] + + if text_pair is not None: + text_pair = self.preprocess(text_pair, linebreak, whitespaces) + pair_tokens = self.text_tokenizer.encode(text_pair) + tokens += pair_tokens + prefix_mask += [0] * len(pair_tokens) + if special_tokens: + tokens += [self.text_tokenizer[""]] + prefix_mask += [0] + + return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask + + + def decode(self, text_ids: list[int]) -> str: + text = self.text_tokenizer.decode(text_ids) + text = text.replace("", "\n") + text = text.replace("<|tab|>", "\t") + text = re.sub(r"<\|blank_(\d\d?)\|>", self.replace_blank_with_spaces, text) + return text + def replace_spaces_with_blank(match: re.Match[str]): + return f"<|blank_{len(match.group())}|>" + + def replace_blank_with_spaces(match: re.Match[str]): + return " " * int(match.group(1)) + +################################################################################# + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + +################################################################################# + + +def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): + """ + 多线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + if glm_onnx_handle is None: + glm_onnx_handle = GetGLMHandle() + if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + glm_onnx_handle.info + if not glm_onnx_handle.success: + error = glm_onnx_handle.info + glm_onnx_handle = None + raise RuntimeError(error) + + # ChatGLM_onnx doesn't have a sys_prompt interface, so add the prompt to history + history_feedin = [] + history_feedin.append(["What can I do?", sys_prompt]) + for i in range(len(history) // 2): + history_feedin.append([history[2 * i], history[2 * i + 1]]) + + watch_dog_patience = 5 # Watchdog patience, set to 5 
seconds + response = "" + for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): + print(response) + if len(observe_window) >= 1: + observe_window[0] = response + if len(observe_window) >= 2: + if (time.time() - observe_window[1]) > watch_dog_patience: + raise RuntimeError("程序终止。") + return response + +def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream=True, additional_fn=None): + """ + 单线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + chatbot.append((inputs, "")) + + global glm_onnx_handle + if glm_onnx_handle is None: + glm_onnx_handle = GetGLMHandle() + chatbot[-1] = (inputs, load_message + "\n\n" + glm_onnx_handle.info) + yield from update_ui(chatbot=chatbot, history=[]) + if not glm_onnx_handle.success: + glm_onnx_handle = None + return + + if additional_fn is not None: + import core_functional + importlib.reload(core_functional) # Hot-reload prompt + core_functional = core_functional.get_core_functions() + if "PreProcess" in core_functional[additional_fn]: + inputs = core_functional[additional_fn]["PreProcess"](inputs) + inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"] + + history_feedin = [] + history_feedin.append(["What can I do?", system_prompt]) + for i in range(len(history) // 2): + history_feedin.append([history[2 * i], history[2 * i + 1]]) + + response = "[Local Message]: 等待ChatGLM_onnx响应中 ..." + for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): + chatbot[-1] = (inputs, response) + yield from update_ui(chatbot=chatbot, history=history) + + if response == "[Local Message]: 等待ChatGLM_onnx响应中 ...": + response = "[Local Message]: ChatGLM_onnx响应异常 ..." + history.extend([inputs, response]) + yield from update_ui(chatbot=chatbot, history=history) + + + + diff --git a/request_llm/requirements_chatglm_onnx.txt b/request_llm/requirements_chatglm_onnx.txt new file mode 100644 index 0000000..de072bd --- /dev/null +++ b/request_llm/requirements_chatglm_onnx.txt @@ -0,0 +1,11 @@ +protobuf +transformers==4.27.1 +cpm_kernels +torch>=1.10 +mdtex2html +sentencepiece +numpy +onnxruntime +sentencepiece +streamlit +streamlit-chat From d7dd586f09f9a0bb013612e0eac4107c2ca3d2b1 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 7 Aug 2023 00:57:52 +0800 Subject: [PATCH 2/4] introduce unified base class for local llm models --- .gitignore | 1 + config.py | 4 +- request_llm/bridge_all.py | 30 +- request_llm/bridge_chatglm_onnx.py | 354 ---------------------- request_llm/bridge_chatglmonnx.py | 308 +++++++++++++++++++ request_llm/bridge_internlm.py | 164 ++-------- request_llm/local_llm_class.py | 178 +++++++++++ request_llm/requirements_chatglm.txt | 2 +- request_llm/requirements_chatglm_onnx.txt | 2 +- 9 files changed, 535 insertions(+), 508 deletions(-) delete mode 100644 request_llm/bridge_chatglm_onnx.py create mode 100644 request_llm/bridge_chatglmonnx.py create mode 100644 request_llm/local_llm_class.py diff --git a/.gitignore b/.gitignore index 55c4db1..c4df287 100644 --- a/.gitignore +++ b/.gitignore @@ -151,3 +151,4 @@ multi-language request_llm/moss media flagged +request_llm/ChatGLM-6b-onnx-u8s8 diff --git a/config.py b/config.py index 659589f..c12d718 100644 --- a/config.py +++ b/config.py @@ -70,8 +70,8 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", 
"api2d-gpt-4", "chatglm","chatglm_onnx","moss", "newbing", "stack-claude"] -# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "internlm", "moss", "newbing", "stack-claude"] +# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] # ChatGLM(2) Finetune Model Path (如果使用ChatGLM2微调模型,需要把"chatglmft"加入AVAIL_LLM_MODELS中) diff --git a/request_llm/bridge_all.py b/request_llm/bridge_all.py index 75e448e..f38711d 100644 --- a/request_llm/bridge_all.py +++ b/request_llm/bridge_all.py @@ -19,11 +19,6 @@ from .bridge_chatgpt import predict as chatgpt_ui from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui from .bridge_chatglm import predict as chatglm_ui -from .bridge_chatglm_onnx import predict_no_ui_long_connection as chatglm_onnx_noui -from .bridge_chatglm_onnx import predict as chatglm_onnx_ui -# from .bridge_tgui import predict_no_ui_long_connection as tgui_noui -# from .bridge_tgui import predict as tgui_ui - colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044'] class LazyloadTiktoken(object): @@ -166,14 +161,7 @@ model_info = { "tokenizer": tokenizer_gpt35, "token_cnt": get_token_num_gpt35, }, - "chatglm_onnx": { - "fn_with_ui": chatglm_onnx_ui, - "fn_without_ui": chatglm_onnx_noui, - "endpoint": None, - "max_token": 1024, - "tokenizer": tokenizer_gpt35, - "token_cnt": get_token_num_gpt35, - }, + } @@ -331,6 +319,22 @@ if "internlm" in AVAIL_LLM_MODELS: }) except: print(trimmed_format_exc()) +if "chatglm_onnx" in AVAIL_LLM_MODELS: + try: + from .bridge_chatglmonnx import predict_no_ui_long_connection as chatglm_onnx_noui + from .bridge_chatglmonnx import predict as chatglm_onnx_ui + model_info.update({ + "chatglm_onnx": { + "fn_with_ui": chatglm_onnx_ui, + "fn_without_ui": chatglm_onnx_noui, + "endpoint": None, + "max_token": 4096, + "tokenizer": tokenizer_gpt35, + "token_cnt": get_token_num_gpt35, + } + }) + except: + print(trimmed_format_exc()) def LLM_CATCH_EXCEPTION(f): """ diff --git a/request_llm/bridge_chatglm_onnx.py b/request_llm/bridge_chatglm_onnx.py deleted file mode 100644 index 636b38d..0000000 --- a/request_llm/bridge_chatglm_onnx.py +++ /dev/null @@ -1,354 +0,0 @@ -import re -import threading -from toolbox import update_ui, get_conf -from multiprocessing import Process, Pipe -import numpy as np -from onnxruntime import InferenceSession, SessionOptions -from sentencepiece import SentencePieceProcessor - - -# 模型来源 K024/ChatGLM-6b-onnx-u8s8 - -global glm_onnx_handle - - -glm_onnx_handle = None -load_message = "ChatGLM_onnx尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,ChatGLM_onnx消耗大量的内存(CPU)或显存(GPU),也许会导致低配(内存<8GB)计算机卡死 ……" - -# Default paths -tokenizer_path = "YOUR/TOKENIZER_PATH/sentencepiece.model" -onnx_model_path = "YOUR/TOKENIZER_PATH/chatglm-6b-int8.onnx" - -# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, -# although they are documented as supported on CUDA. 
-providers = ["CPUExecutionProvider"] - -# if torch.cuda.is_available(): -# providers = ["CUDAExecutionProvider"] + providers - - -################################################################################# -class GetGLMHandle(Process): - - def __init__(self): - super().__init__(daemon=True) - self.parent, self.child = Pipe() - self.ChatGLM_onnx_model = None # tokenizer_path - self.ChatGLM_onnx_tokenizer = None # onnx_model_path - self.info = "" - self.success = True - self.check_dependency() - self.start() - self.threadLock = threading.Lock() - - def check_dependency(self): - try: - import sentencepiece - self.info = "依赖检测通过" - self.success = True - except: - self.info = "缺少ChatGLM_onnx的依赖,如果要使用ChatGLM_onnx,除了基础的pip依赖以外,您还需要运行`pip install -r request_llm/requirements_ChatGLM_onnx.txt`安装ChatGLM_onnx的依赖。" - self.success = False - - def ready(self): - return self.ChatGLM_onnx_model is not None - - - def run(self): - # 子进程执行 - # 第一次运行,加载参数 - retry = 0 - while True: - try: - if self.ChatGLM_onnx_model is None: - # Initialize the ChatGLMModel and ChatGLMTokenizer - self.ChatGLM_onnx_model = ChatGLMModel() - self.ChatGLM_onnx_tokenizer = ChatGLMTokenizer() - break - else: - break - except: - retry += 1 - if retry > 3: - self.child.send('[Local Message] Call ChatGLM_onnx fail 不能正常加载ChatGLM_onnx的参数。') - raise RuntimeError("不能正常加载ChatGLM_onnx的参数!") - - while True: - # 进入任务等待状态 - kwargs = self.child.recv() - # 收到消息,开始请求 - try: - # Use the ChatGLMModel and ChatGLMTokenizer to generate a response - response = tuple(self.ChatGLM_onnx_model.generate_iterate(kwargs['query'])) - - # Send the output data - self.child.send(response[-1]) - except: - from toolbox import trimmed_format_exc - self.child.send('[Local Message] Call ChatGLM_onnx fail.' + '\n```\n' + trimmed_format_exc() + '\n```\n') - # 请求处理结束,开始下一个循环 - self.child.send('[Finish]') - - - - def stream_chat(self, **kwargs): - # 主进程执行 - self.threadLock.acquire() - self.parent.send(kwargs) - while True: - res = self.parent.recv() - if res != '[Finish]': - yield res - else: - break - self.threadLock.release() - - -################################################################################# -class ChatGLMModel(): - - def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: - self.tokenizer = ChatGLMTokenizer(tokenizer_path) - options = SessionOptions() - options.enable_profiling = profile - self.session = InferenceSession(onnx_model_path, options, providers=providers) - self.eop_token_id = self.tokenizer[""] - # input & output names - self.past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] - self.present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] - self.output_names = ["logits"] + self.present_names - - # default kv_cache for first inference - self.default_past_key_values = { - k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in self.past_names - } - - def prepare_input(self, prompt: str): - input_ids, prefix_mask = self.tokenizer.encode(prompt) - - input_ids = np.array([input_ids], dtype=np.longlong) - prefix_mask = np.array([prefix_mask], dtype=np.longlong) - - return input_ids, prefix_mask, self.default_past_key_values - - - def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): - # softmax with temperature - exp_logits = np.exp(logits / temperature) - probs = exp_logits / np.sum(exp_logits) - - # top k - top_k_idx = np.argsort(-probs)[:top_k] - top_k_probs = probs[top_k_idx] - - # top p - 
cumsum_probs = np.cumsum(top_k_probs) - top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 - top_k_probs = top_k_probs / np.sum(top_k_probs) - - # sample - next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) - return next_token[0].item() - - - def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): - input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) - output_tokens = [] - - while True: - inputs = { - "input_ids": input_ids, - "prefix_mask": prefix_mask, - "use_past": np.array(len(output_tokens) > 0), - } - inputs.update(past_key_values) - - logits, *past_key_values = self.session.run(self.output_names, inputs) - past_key_values = { k: v for k, v in zip(self.past_names, past_key_values) } - - next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) - - output_tokens += [next_token] - - if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: - break - - input_ids = np.array([[next_token]], dtype=np.longlong) - prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) - - yield process_response(self.tokenizer.decode(output_tokens)) - - return process_response(self.tokenizer.decode(output_tokens)) - -class ChatGLMTokenizer: - def __init__(self, vocab_file): - assert vocab_file is not None - self.vocab_file = vocab_file - self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""] - self.text_tokenizer = SentencePieceProcessor(str(vocab_file)) - - def __len__(self): - return len(self.text_tokenizer) - - def __getitem__(self, key: str): - return self.text_tokenizer[key] - - - def preprocess(self, text: str, linebreak=True, whitespaces=True): - if linebreak: - text = text.replace("\\n", "") - if whitespaces: - text = text.replace("\\t", "<|tab|>") - text = re.sub(r" {2,80}", self.replace_spaces_with_blank, text) - return text - - - def encode( - self, text: str, text_pair: str = None, - linebreak=True, whitespaces=True, - add_dummy_prefix=True, special_tokens=True, - ) -> tuple[list[int], list[int]]: - """ - text: Text to encode. Bidirectional part with a [gMASK] and an for causal LM. - text_pair: causal LM part. - linebreak: Whether to encode newline (\n) in text. - whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. - special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. - add_dummy_prefix: Whether to add dummy blank space in the beginning. 
- """ - text = self.preprocess(text, linebreak, whitespaces) - if not add_dummy_prefix: - text = "" + text - - tokens = self.text_tokenizer.encode(text) - prefix_mask = [1] * len(tokens) - if special_tokens: - tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] - prefix_mask += [1, 0] - - if text_pair is not None: - text_pair = self.preprocess(text_pair, linebreak, whitespaces) - pair_tokens = self.text_tokenizer.encode(text_pair) - tokens += pair_tokens - prefix_mask += [0] * len(pair_tokens) - if special_tokens: - tokens += [self.text_tokenizer[""]] - prefix_mask += [0] - - return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask - - - def decode(self, text_ids: list[int]) -> str: - text = self.text_tokenizer.decode(text_ids) - text = text.replace("", "\n") - text = text.replace("<|tab|>", "\t") - text = re.sub(r"<\|blank_(\d\d?)\|>", self.replace_blank_with_spaces, text) - return text - def replace_spaces_with_blank(match: re.Match[str]): - return f"<|blank_{len(match.group())}|>" - - def replace_blank_with_spaces(match: re.Match[str]): - return " " * int(match.group(1)) - -################################################################################# - - -def chat_template(history: list[tuple[str, str]], current: str): - prompt = "" - chat_round = 0 - for question, answer in history: - prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" - chat_round += 1 - prompt += f"[Round {chat_round}]\n问:{current}\n答:" - return prompt - -def process_response(response: str): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) - return response - -################################################################################# - - -def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): - """ - 多线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - if glm_onnx_handle is None: - glm_onnx_handle = GetGLMHandle() - if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + glm_onnx_handle.info - if not glm_onnx_handle.success: - error = glm_onnx_handle.info - glm_onnx_handle = None - raise RuntimeError(error) - - # ChatGLM_onnx doesn't have a sys_prompt interface, so add the prompt to history - history_feedin = [] - history_feedin.append(["What can I do?", sys_prompt]) - for i in range(len(history) // 2): - history_feedin.append([history[2 * i], history[2 * i + 1]]) - - watch_dog_patience = 5 # Watchdog patience, set to 5 seconds - response = "" - for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): - print(response) - if len(observe_window) >= 1: - observe_window[0] = response - if len(observe_window) >= 2: - if (time.time() - observe_window[1]) > watch_dog_patience: - raise RuntimeError("程序终止。") - return response - -def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream=True, additional_fn=None): - """ - 单线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - chatbot.append((inputs, "")) - - global glm_onnx_handle - if glm_onnx_handle is None: - glm_onnx_handle = GetGLMHandle() - chatbot[-1] = (inputs, load_message + "\n\n" + glm_onnx_handle.info) - yield from update_ui(chatbot=chatbot, history=[]) - if not 
glm_onnx_handle.success: - glm_onnx_handle = None - return - - if additional_fn is not None: - import core_functional - importlib.reload(core_functional) # Hot-reload prompt - core_functional = core_functional.get_core_functions() - if "PreProcess" in core_functional[additional_fn]: - inputs = core_functional[additional_fn]["PreProcess"](inputs) - inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"] - - history_feedin = [] - history_feedin.append(["What can I do?", system_prompt]) - for i in range(len(history) // 2): - history_feedin.append([history[2 * i], history[2 * i + 1]]) - - response = "[Local Message]: 等待ChatGLM_onnx响应中 ..." - for response in glm_onnx_handle.stream_chat(query=inputs, history=history_feedin): - chatbot[-1] = (inputs, response) - yield from update_ui(chatbot=chatbot, history=history) - - if response == "[Local Message]: 等待ChatGLM_onnx响应中 ...": - response = "[Local Message]: ChatGLM_onnx响应异常 ..." - history.extend([inputs, response]) - yield from update_ui(chatbot=chatbot, history=history) - - - - diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py new file mode 100644 index 0000000..4d9844a --- /dev/null +++ b/request_llm/bridge_chatglmonnx.py @@ -0,0 +1,308 @@ +model_name = "ChatGLM-ONNX" +cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`" + + +from transformers import AutoModel, AutoTokenizer +import time +import threading +import importlib +from toolbox import update_ui, get_conf +from multiprocessing import Process, Pipe +from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM + + + + + + + + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py +# ------------------------------------------------------------------------------------------------------------------------ +import re +import numpy as np +# import torch +from onnxruntime import InferenceSession, SessionOptions + + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, +# although they are documented as supported on CUDA. 
+providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +# Default paths +tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" +onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + + +# input & output names +past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] +present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] +output_names = ["logits"] + present_names + + +# default kv_cache for first inference +default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names +} + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(output_names, inputs) + past_key_values = { k: v for k, v in zip(past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) 
+ + return process_response(self.tokenizer.decode(output_tokens)) + + + + + + + + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py +# ------------------------------------------------------------------------------------------------------------------------ + +import re +from sentencepiece import SentencePieceProcessor + + +def replace_spaces_with_blank(match: re.Match[str]): + return f"<|blank_{len(match.group())}|>" + + +def replace_blank_with_spaces(match: re.Match[str]): + return " " * int(match.group(1)) + + +class ChatGLMTokenizer: + def __init__(self, vocab_file): + assert vocab_file is not None + self.vocab_file = vocab_file + self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "", "", "", "", ""] + self.text_tokenizer = SentencePieceProcessor(str(vocab_file)) + + def __len__(self): + return len(self.text_tokenizer) + + def __getitem__(self, key: str): + return self.text_tokenizer[key] + + + def preprocess(self, text: str, linebreak=True, whitespaces=True): + if linebreak: + text = text.replace("\n", "") + if whitespaces: + text = text.replace("\t", "<|tab|>") + text = re.sub(r" {2,80}", replace_spaces_with_blank, text) + return text + + + def encode( + self, text: str, text_pair: str = None, + linebreak=True, whitespaces=True, + add_dummy_prefix=True, special_tokens=True, + ) -> tuple[list[int], list[int]]: + """ + text: Text to encode. Bidirectional part with a [gMASK] and an for causal LM. + text_pair: causal LM part. + linebreak: Whether to encode newline (\n) in text. + whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding. + special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text. + add_dummy_prefix: Whether to add dummy blank space in the beginning. 
+ """ + text = self.preprocess(text, linebreak, whitespaces) + if not add_dummy_prefix: + text = "" + text + + tokens = self.text_tokenizer.encode(text) + prefix_mask = [1] * len(tokens) + if special_tokens: + tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] + prefix_mask += [1, 0] + + if text_pair is not None: + text_pair = self.preprocess(text_pair, linebreak, whitespaces) + pair_tokens = self.text_tokenizer.encode(text_pair) + tokens += pair_tokens + prefix_mask += [0] * len(pair_tokens) + if special_tokens: + tokens += [self.text_tokenizer[""]] + prefix_mask += [0] + + return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask + + + def decode(self, text_ids: list[int]) -> str: + text = self.text_tokenizer.decode(text_ids) + text = text.replace("", "\n") + text = text.replace("<|tab|>", "\t") + text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) + return text + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Local Model +# ------------------------------------------------------------------------------------------------------------------------ +@SingletonLocalLLM +class GetONNXGLMHandle(LocalLLMHandle): + + def load_model_info(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + self.model_name = model_name + self.cmd_to_install = cmd_to_install + + def load_model_and_tokenizer(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + import os, glob + if not len(glob.glob("./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/*.bin")) >= 7: # 该模型有七个 bin 文件 + from huggingface_hub import snapshot_download + snapshot_download(repo_id="K024/ChatGLM-6b-onnx-u8s8", local_dir="./request_llm/ChatGLM-6b-onnx-u8s8") + def create_model(): + return ChatGLMModel( + tokenizer_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model", + onnx_model_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + ) + self._model = create_model() + return self._model, None + + def llm_stream_generator(self, **kwargs): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + def adaptor(kwargs): + model = self._model + tokenizer = self._tokenizer + prompt = kwargs['query'] + max_length = kwargs['max_length'] + top_p = kwargs['top_p'] + temperature = kwargs['temperature'] + history = kwargs['history'] + real_prompt = combine_history(prompt, history) + return model, tokenizer, real_prompt, max_length, top_p, temperature + + model, tokenizer, prompt, max_length, top_p, temperature = adaptor(kwargs) + + prompt = chat_template(history, question) + for answer in self._model.generate_iterate( + prompt, + max_generated_tokens=max_length, + top_k=1, + top_p=top_p, + temperature=temperature, + ): + yield answer + + def try_to_import_special_deps(self, **kwargs): + # import something that will raise error if the user does not install requirement_*.txt + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + pass + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 GPT-Academic Interface +# ------------------------------------------------------------------------------------------------------------------------ +predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name) \ No newline at end of file diff --git a/request_llm/bridge_internlm.py b/request_llm/bridge_internlm.py index a0ba3ba..804edc8 100644 --- a/request_llm/bridge_internlm.py +++ b/request_llm/bridge_internlm.py @@ -1,23 +1,25 @@ +model_name = 
"InternLM" +cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`" from transformers import AutoModel, AutoTokenizer import time import threading import importlib -from toolbox import update_ui, get_conf, Singleton +from toolbox import update_ui, get_conf from multiprocessing import Process, Pipe +from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM -model_name = "InternLM" -cmd_to_install = "`pip install ???`" -load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……" + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Local Model Utils +# ------------------------------------------------------------------------------------------------------------------------ def try_to_import_special_deps(): import sentencepiece -user_prompt = "<|User|>:{user}\n" -robot_prompt = "<|Bot|>:{robot}\n" -cur_query_prompt = "<|User|>:{user}\n<|Bot|>:" - - def combine_history(prompt, hist): + user_prompt = "<|User|>:{user}\n" + robot_prompt = "<|Bot|>:{robot}\n" + cur_query_prompt = "<|User|>:{user}\n<|Bot|>:" messages = hist total_prompt = "" for message in messages: @@ -29,24 +31,22 @@ def combine_history(prompt, hist): total_prompt = total_prompt + cur_query_prompt.replace("{user}", prompt) return total_prompt +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Local Model +# ------------------------------------------------------------------------------------------------------------------------ +@SingletonLocalLLM +class GetInternlmHandle(LocalLLMHandle): -@Singleton -class GetInternlmHandle(Process): - def __init__(self): - # ⭐主进程执行 - super().__init__(daemon=True) - self.parent, self.child = Pipe() - self._model = None - self._tokenizer = None - self.info = "" - self.success = True - self.check_dependency() - self.start() - self.threadLock = threading.Lock() + def load_model_info(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + self.model_name = model_name + self.cmd_to_install = cmd_to_install - def ready(self): - # ⭐主进程执行 - return self._model is not None + def try_to_import_special_deps(self, **kwargs): + """ + import something that will raise error if the user does not install requirement_*.txt + """ + import sentencepiece def load_model_and_tokenizer(self): # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 @@ -195,118 +195,8 @@ class GetInternlmHandle(Process): if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): return - - - def check_dependency(self): - # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 - try: - try_to_import_special_deps() - self.info = "依赖检测通过" - self.success = True - except: - self.info = f"缺少{model_name}的依赖,如果要使用{model_name},除了基础的pip依赖以外,您还需要运行{cmd_to_install}安装{model_name}的依赖。" - self.success = False - - def run(self): - # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 - # 第一次运行,加载参数 - try: - self._model, self._tokenizer = self.load_model_and_tokenizer() - except: - from toolbox import trimmed_format_exc - self.child.send(f'[Local Message] 不能正常加载{model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n') - raise RuntimeError(f"不能正常加载{model_name}的参数!") - - while True: - # 进入任务等待状态 - kwargs = self.child.recv() - # 收到消息,开始请求 - try: - for response_full in self.llm_stream_generator(**kwargs): - self.child.send(response_full) - except: - from toolbox import trimmed_format_exc - self.child.send(f'[Local Message] 调用{model_name}失败.' 
+ '\n```\n' + trimmed_format_exc() + '\n```\n') - # 请求处理结束,开始下一个循环 - self.child.send('[Finish]') - - def stream_chat(self, **kwargs): - # ⭐主进程执行 - self.threadLock.acquire() - self.parent.send(kwargs) - while True: - res = self.parent.recv() - if res != '[Finish]': - yield res - else: - break - self.threadLock.release() - # ------------------------------------------------------------------------------------------------------------------------ -# 🔌💻 GPT-Academic +# 🔌💻 GPT-Academic Interface # ------------------------------------------------------------------------------------------------------------------------ -def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): - """ - ⭐多线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - _llm_handle = GetInternlmHandle() - if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info - if not _llm_handle.success: - error = _llm_handle.info - _llm_handle = None - raise RuntimeError(error) - - # chatglm 没有 sys_prompt 接口,因此把prompt加入 history - history_feedin = [] - history_feedin.append(["What can I do?", sys_prompt]) - for i in range(len(history)//2): - history_feedin.append([history[2*i], history[2*i+1]] ) - - watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可 - response = "" - for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): - if len(observe_window) >= 1: observe_window[0] = response - if len(observe_window) >= 2: - if (time.time()-observe_window[1]) > watch_dog_patience: - raise RuntimeError("程序终止。") - return response - - - -def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None): - """ - ⭐单线程方法 - 函数的说明请见 request_llm/bridge_all.py - """ - chatbot.append((inputs, "")) - - _llm_handle = GetInternlmHandle() - chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info) - yield from update_ui(chatbot=chatbot, history=[]) - if not _llm_handle.success: - _llm_handle = None - return - - if additional_fn is not None: - from core_functional import handle_core_functionality - inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot) - - # 处理历史信息 - history_feedin = [] - history_feedin.append(["What can I do?", system_prompt] ) - for i in range(len(history)//2): - history_feedin.append([history[2*i], history[2*i+1]] ) - - # 开始接收chatglm的回复 - response = f"[Local Message]: 等待{model_name}响应中 ..." - for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): - chatbot[-1] = (inputs, response) - yield from update_ui(chatbot=chatbot, history=history) - - # 总结输出 - if response == f"[Local Message]: 等待{model_name}响应中 ...": - response = f"[Local Message]: {model_name}响应异常 ..." 
- history.extend([inputs, response]) - yield from update_ui(chatbot=chatbot, history=history) +predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetInternlmHandle, model_name) \ No newline at end of file diff --git a/request_llm/local_llm_class.py b/request_llm/local_llm_class.py new file mode 100644 index 0000000..1470717 --- /dev/null +++ b/request_llm/local_llm_class.py @@ -0,0 +1,178 @@ +from transformers import AutoModel, AutoTokenizer +import time +import threading +import importlib +from toolbox import update_ui, get_conf, Singleton +from multiprocessing import Process, Pipe + +def SingletonLocalLLM(cls): + """ + 一个单实例装饰器 + """ + _instance = {} + def _singleton(*args, **kargs): + if cls not in _instance: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + elif _instance[cls].corrupted: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + else: + return _instance[cls] + return _singleton + +class LocalLLMHandle(Process): + def __init__(self): + # ⭐主进程执行 + super().__init__(daemon=True) + self.corrupted = False + self.load_model_info() + self.parent, self.child = Pipe() + self.running = True + self._model = None + self._tokenizer = None + self.info = "" + self.check_dependency() + self.start() + self.threadLock = threading.Lock() + + def load_model_info(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + raise NotImplementedError("Method not implemented yet") + self.model_name = "" + self.cmd_to_install = "" + + def load_model_and_tokenizer(self): + """ + This function should return the model and the tokenizer + """ + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + raise NotImplementedError("Method not implemented yet") + + def llm_stream_generator(self, **kwargs): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + raise NotImplementedError("Method not implemented yet") + + def try_to_import_special_deps(self, **kwargs): + """ + import something that will raise error if the user does not install requirement_*.txt + """ + # ⭐主进程执行 + raise NotImplementedError("Method not implemented yet") + + def check_dependency(self): + # ⭐主进程执行 + try: + self.try_to_import_special_deps() + self.info = "依赖检测通过" + self.running = True + except: + self.info = f"缺少{self.model_name}的依赖,如果要使用{self.model_name},除了基础的pip依赖以外,您还需要运行{self.cmd_to_install}安装{self.model_name}的依赖。" + self.running = False + + def run(self): + # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 + # 第一次运行,加载参数 + try: + self._model, self._tokenizer = self.load_model_and_tokenizer() + except: + self.running = False + from toolbox import trimmed_format_exc + self.child.send(f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n') + self.child.send('[FinishBad]') + raise RuntimeError(f"不能正常加载{self.model_name}的参数!") + + while True: + # 进入任务等待状态 + kwargs = self.child.recv() + # 收到消息,开始请求 + try: + for response_full in self.llm_stream_generator(**kwargs): + self.child.send(response_full) + self.child.send('[Finish]') + # 请求处理结束,开始下一个循环 + except: + from toolbox import trimmed_format_exc + self.child.send(f'[Local Message] 调用{self.model_name}失败.' 
+ '\n```\n' + trimmed_format_exc() + '\n```\n') + self.child.send('[Finish]') + + def stream_chat(self, **kwargs): + # ⭐主进程执行 + self.threadLock.acquire() + self.parent.send(kwargs) + while True: + res = self.parent.recv() + if res == '[Finish]': + break + if res == '[FinishBad]': + self.running = False + self.corrupted = True + break + else: + yield res + self.threadLock.release() + + + +def get_local_llm_predict_fns(LLMSingletonClass, model_name): + load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……" + + def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False): + """ + ⭐多线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + _llm_handle = LLMSingletonClass() + if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info + + # chatglm 没有 sys_prompt 接口,因此把prompt加入 history + history_feedin = [] + history_feedin.append(["What can I do?", sys_prompt]) + for i in range(len(history)//2): + history_feedin.append([history[2*i], history[2*i+1]] ) + + watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可 + response = "" + for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): + if len(observe_window) >= 1: + observe_window[0] = response + if len(observe_window) >= 2: + if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。") + return response + + + + def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None): + """ + ⭐单线程方法 + 函数的说明请见 request_llm/bridge_all.py + """ + chatbot.append((inputs, "")) + + _llm_handle = LLMSingletonClass() + chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info) + yield from update_ui(chatbot=chatbot, history=[]) + + if additional_fn is not None: + from core_functional import handle_core_functionality + inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot) + + # 处理历史信息 + history_feedin = [] + history_feedin.append(["What can I do?", system_prompt] ) + for i in range(len(history)//2): + history_feedin.append([history[2*i], history[2*i+1]] ) + + # 开始接收回复 + response = f"[Local Message]: 等待{model_name}响应中 ..." + for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']): + chatbot[-1] = (inputs, response) + yield from update_ui(chatbot=chatbot, history=history) + + # 总结输出 + if response == f"[Local Message]: 等待{model_name}响应中 ...": + response = f"[Local Message]: {model_name}响应异常 ..." 
+ history.extend([inputs, response]) + yield from update_ui(chatbot=chatbot, history=history) + + return predict_no_ui_long_connection, predict \ No newline at end of file diff --git a/request_llm/requirements_chatglm.txt b/request_llm/requirements_chatglm.txt index fa049ca..b2629f8 100644 --- a/request_llm/requirements_chatglm.txt +++ b/request_llm/requirements_chatglm.txt @@ -1,5 +1,5 @@ protobuf -transformers==4.27.1 +transformers>=4.27.1 cpm_kernels torch>=1.10 mdtex2html diff --git a/request_llm/requirements_chatglm_onnx.txt b/request_llm/requirements_chatglm_onnx.txt index de072bd..70ab668 100644 --- a/request_llm/requirements_chatglm_onnx.txt +++ b/request_llm/requirements_chatglm_onnx.txt @@ -1,5 +1,5 @@ protobuf -transformers==4.27.1 +transformers>=4.27.1 cpm_kernels torch>=1.10 mdtex2html From 57d4541d4ee8f380358f7b1aab878eb39fa8b3c5 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 7 Aug 2023 01:07:55 +0800 Subject: [PATCH 3/4] fix minor bug in chatglm-onnx --- request_llm/bridge_chatglmonnx.py | 245 +----------------------------- request_llm/chatglmoonx.py | 229 ++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 240 deletions(-) create mode 100644 request_llm/chatglmoonx.py diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py index 4d9844a..cde802a 100644 --- a/request_llm/bridge_chatglmonnx.py +++ b/request_llm/bridge_chatglmonnx.py @@ -10,239 +10,7 @@ from toolbox import update_ui, get_conf from multiprocessing import Process, Pipe from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM - - - - - - - - - - - - - -# ------------------------------------------------------------------------------------------------------------------------ -# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py -# ------------------------------------------------------------------------------------------------------------------------ -import re -import numpy as np -# import torch -from onnxruntime import InferenceSession, SessionOptions - - -# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, -# although they are documented as supported on CUDA. 
-providers = ["CPUExecutionProvider"] - -# if torch.cuda.is_available(): -# providers = ["CUDAExecutionProvider"] + providers - - -# Default paths -tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" -onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" - - -# input & output names -past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] -present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] -output_names = ["logits"] + present_names - - -# default kv_cache for first inference -default_past_key_values = { - k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names -} - - -def chat_template(history: list[tuple[str, str]], current: str): - prompt = "" - chat_round = 0 - for question, answer in history: - prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" - chat_round += 1 - prompt += f"[Round {chat_round}]\n问:{current}\n答:" - return prompt - - -def process_response(response: str): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) - return response - - -class ChatGLMModel(): - - def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: - self.tokenizer = ChatGLMTokenizer(tokenizer_path) - options = SessionOptions() - options.enable_profiling = profile - self.session = InferenceSession(onnx_model_path, options, providers=providers) - self.eop_token_id = self.tokenizer[""] - - - def prepare_input(self, prompt: str): - input_ids, prefix_mask = self.tokenizer.encode(prompt) - - input_ids = np.array([input_ids], dtype=np.longlong) - prefix_mask = np.array([prefix_mask], dtype=np.longlong) - - return input_ids, prefix_mask, default_past_key_values - - - def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): - # softmax with temperature - exp_logits = np.exp(logits / temperature) - probs = exp_logits / np.sum(exp_logits) - - # top k - top_k_idx = np.argsort(-probs)[:top_k] - top_k_probs = probs[top_k_idx] - - # top p - cumsum_probs = np.cumsum(top_k_probs) - top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 - top_k_probs = top_k_probs / np.sum(top_k_probs) - - # sample - next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) - return next_token[0].item() - - - def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): - input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) - output_tokens = [] - - while True: - inputs = { - "input_ids": input_ids, - "prefix_mask": prefix_mask, - "use_past": np.array(len(output_tokens) > 0), - } - inputs.update(past_key_values) - - logits, *past_key_values = self.session.run(output_names, inputs) - past_key_values = { k: v for k, v in zip(past_names, past_key_values) } - - next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) - - output_tokens += [next_token] - - if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: - break - - input_ids = np.array([[next_token]], dtype=np.longlong) - prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) - - yield process_response(self.tokenizer.decode(output_tokens)) 
-
-        return process_response(self.tokenizer.decode(output_tokens))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
-# ------------------------------------------------------------------------------------------------------------------------
-
-import re
-from sentencepiece import SentencePieceProcessor
-
-
-def replace_spaces_with_blank(match: re.Match[str]):
-    return f"<|blank_{len(match.group())}|>"
-
-
-def replace_blank_with_spaces(match: re.Match[str]):
-    return " " * int(match.group(1))
-
-
-class ChatGLMTokenizer:
-    def __init__(self, vocab_file):
-        assert vocab_file is not None
-        self.vocab_file = vocab_file
-        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
-        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
-
-    def __len__(self):
-        return len(self.text_tokenizer)
-
-    def __getitem__(self, key: str):
-        return self.text_tokenizer[key]
-
-
-    def preprocess(self, text: str, linebreak=True, whitespaces=True):
-        if linebreak:
-            text = text.replace("\n", "<n>")
-        if whitespaces:
-            text = text.replace("\t", "<|tab|>")
-            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
-        return text
-
-
-    def encode(
-        self, text: str, text_pair: str = None,
-        linebreak=True, whitespaces=True,
-        add_dummy_prefix=True, special_tokens=True,
-    ) -> tuple[list[int], list[int]]:
-        """
-        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
-        text_pair: causal LM part.
-        linebreak: Whether to encode newline (\n) in text.
-        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
-        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
-        add_dummy_prefix: Whether to add dummy blank space in the beginning.
- """ - text = self.preprocess(text, linebreak, whitespaces) - if not add_dummy_prefix: - text = "" + text - - tokens = self.text_tokenizer.encode(text) - prefix_mask = [1] * len(tokens) - if special_tokens: - tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] - prefix_mask += [1, 0] - - if text_pair is not None: - text_pair = self.preprocess(text_pair, linebreak, whitespaces) - pair_tokens = self.text_tokenizer.encode(text_pair) - tokens += pair_tokens - prefix_mask += [0] * len(pair_tokens) - if special_tokens: - tokens += [self.text_tokenizer[""]] - prefix_mask += [0] - - return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask - - - def decode(self, text_ids: list[int]) -> str: - text = self.text_tokenizer.decode(text_ids) - text = text.replace("", "\n") - text = text.replace("<|tab|>", "\t") - text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) - return text +from .chatglmoonx import ChatGLMModel, chat_template @@ -274,19 +42,16 @@ class GetONNXGLMHandle(LocalLLMHandle): def llm_stream_generator(self, **kwargs): # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行 def adaptor(kwargs): - model = self._model - tokenizer = self._tokenizer - prompt = kwargs['query'] + query = kwargs['query'] max_length = kwargs['max_length'] top_p = kwargs['top_p'] temperature = kwargs['temperature'] history = kwargs['history'] - real_prompt = combine_history(prompt, history) - return model, tokenizer, real_prompt, max_length, top_p, temperature + return query, max_length, top_p, temperature, history - model, tokenizer, prompt, max_length, top_p, temperature = adaptor(kwargs) + query, max_length, top_p, temperature, history = adaptor(kwargs) - prompt = chat_template(history, question) + prompt = chat_template(history, query) for answer in self._model.generate_iterate( prompt, max_generated_tokens=max_length, diff --git a/request_llm/chatglmoonx.py b/request_llm/chatglmoonx.py new file mode 100644 index 0000000..444181e --- /dev/null +++ b/request_llm/chatglmoonx.py @@ -0,0 +1,229 @@ + + + + + + + +# ------------------------------------------------------------------------------------------------------------------------ +# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py +# ------------------------------------------------------------------------------------------------------------------------ +import re +import numpy as np +# import torch +from onnxruntime import InferenceSession, SessionOptions + + +# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU, +# although they are documented as supported on CUDA. 
+providers = ["CPUExecutionProvider"] + +# if torch.cuda.is_available(): +# providers = ["CUDAExecutionProvider"] + providers + + +# Default paths +tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model" +onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx" + + +# input & output names +past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]] +present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]] +output_names = ["logits"] + present_names + + +# default kv_cache for first inference +default_past_key_values = { + k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names +} + + +def chat_template(history: list[tuple[str, str]], current: str): + prompt = "" + chat_round = 0 + for question, answer in history: + prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n" + chat_round += 1 + prompt += f"[Round {chat_round}]\n问:{current}\n答:" + return prompt + + +def process_response(response: str): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + +class ChatGLMModel(): + + def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None: + self.tokenizer = ChatGLMTokenizer(tokenizer_path) + options = SessionOptions() + options.enable_profiling = profile + self.session = InferenceSession(onnx_model_path, options, providers=providers) + self.eop_token_id = self.tokenizer[""] + + + def prepare_input(self, prompt: str): + input_ids, prefix_mask = self.tokenizer.encode(prompt) + + input_ids = np.array([input_ids], dtype=np.longlong) + prefix_mask = np.array([prefix_mask], dtype=np.longlong) + + return input_ids, prefix_mask, default_past_key_values + + + def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1): + # softmax with temperature + exp_logits = np.exp(logits / temperature) + probs = exp_logits / np.sum(exp_logits) + + # top k + top_k_idx = np.argsort(-probs)[:top_k] + top_k_probs = probs[top_k_idx] + + # top p + cumsum_probs = np.cumsum(top_k_probs) + top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0 + top_k_probs = top_k_probs / np.sum(top_k_probs) + + # sample + next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs) + return next_token[0].item() + + + def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1): + input_ids, prefix_mask, past_key_values = self.prepare_input(prompt) + output_tokens = [] + + while True: + inputs = { + "input_ids": input_ids, + "prefix_mask": prefix_mask, + "use_past": np.array(len(output_tokens) > 0), + } + inputs.update(past_key_values) + + logits, *past_key_values = self.session.run(output_names, inputs) + past_key_values = { k: v for k, v in zip(past_names, past_key_values) } + + next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature) + + output_tokens += [next_token] + + if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens: + break + + input_ids = np.array([[next_token]], dtype=np.longlong) + prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1) + + yield process_response(self.tokenizer.decode(output_tokens)) 
+
+        return process_response(self.tokenizer.decode(output_tokens))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
+# ------------------------------------------------------------------------------------------------------------------------
+
+import re
+from sentencepiece import SentencePieceProcessor
+
+
+def replace_spaces_with_blank(match: re.Match[str]):
+    return f"<|blank_{len(match.group())}|>"
+
+
+def replace_blank_with_spaces(match: re.Match[str]):
+    return " " * int(match.group(1))
+
+
+class ChatGLMTokenizer:
+    def __init__(self, vocab_file):
+        assert vocab_file is not None
+        self.vocab_file = vocab_file
+        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
+        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))
+
+    def __len__(self):
+        return len(self.text_tokenizer)
+
+    def __getitem__(self, key: str):
+        return self.text_tokenizer[key]
+
+
+    def preprocess(self, text: str, linebreak=True, whitespaces=True):
+        if linebreak:
+            text = text.replace("\n", "<n>")
+        if whitespaces:
+            text = text.replace("\t", "<|tab|>")
+            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
+        return text
+
+
+    def encode(
+        self, text: str, text_pair: str = None,
+        linebreak=True, whitespaces=True,
+        add_dummy_prefix=True, special_tokens=True,
+    ) -> tuple[list[int], list[int]]:
+        """
+        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
+        text_pair: causal LM part.
+        linebreak: Whether to encode newline (\n) in text.
+        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
+        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
+        add_dummy_prefix: Whether to add dummy blank space in the beginning.
+ """ + text = self.preprocess(text, linebreak, whitespaces) + if not add_dummy_prefix: + text = "" + text + + tokens = self.text_tokenizer.encode(text) + prefix_mask = [1] * len(tokens) + if special_tokens: + tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer[""]] + prefix_mask += [1, 0] + + if text_pair is not None: + text_pair = self.preprocess(text_pair, linebreak, whitespaces) + pair_tokens = self.text_tokenizer.encode(text_pair) + tokens += pair_tokens + prefix_mask += [0] * len(pair_tokens) + if special_tokens: + tokens += [self.text_tokenizer[""]] + prefix_mask += [0] + + return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask + + + def decode(self, text_ids: list[int]) -> str: + text = self.text_tokenizer.decode(text_ids) + text = text.replace("", "\n") + text = text.replace("<|tab|>", "\t") + text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text) + return text + + From 0a37106692f01bcedcf19f4c2aea1dc8b954faea Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 7 Aug 2023 01:11:44 +0800 Subject: [PATCH 4/4] reverse cmd_to_install --- config.py | 2 +- request_llm/bridge_chatglmonnx.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config.py b/config.py index c12d718..1d43dd4 100644 --- a/config.py +++ b/config.py @@ -70,7 +70,7 @@ MAX_RETRY = 2 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 ) LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓ -AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "internlm", "moss", "newbing", "stack-claude"] +AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"] # P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"] diff --git a/request_llm/bridge_chatglmonnx.py b/request_llm/bridge_chatglmonnx.py index cde802a..fbe64b4 100644 --- a/request_llm/bridge_chatglmonnx.py +++ b/request_llm/bridge_chatglmonnx.py @@ -1,5 +1,5 @@ model_name = "ChatGLM-ONNX" -cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`" +cmd_to_install = "`pip install request_llm/requirements_chatglm_onnx.txt`" from transformers import AutoModel, AutoTokenizer