Merge pull request #1009 from ValeriaWong/master

feat(chatglm_int8_onnx): CPU-only inference needing at most 8 GB of RAM; inference speed not yet benchmarked; the token limit is small; streaming output is not supported yet #…

Commit 9bee676cd2
.gitignore (vendored, 1 change)
@@ -151,3 +151,4 @@ multi-language
 request_llm/moss
 media
 flagged
+request_llm/ChatGLM-6b-onnx-u8s8
config.py
@@ -71,7 +71,7 @@ MAX_RETRY = 2
 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 )
 LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓
 AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"]
-# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"]
+# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"]
 
 
 # ChatGLM(2) Finetune Model Path (如果使用ChatGLM2微调模型,需要把"chatglmft"加入AVAIL_LLM_MODELS中)
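For readers wiring this up, a minimal sketch of the opt-in step (the variable names come from the config.py hunk above; the surrounding list values are illustrative, not the full defaults):

# config.py (illustrative excerpt): "chatglm_onnx" only becomes selectable once it is listed here.
LLM_MODEL = "gpt-3.5-turbo"                             # default model; must itself appear in AVAIL_LLM_MODELS
AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "chatglm_onnx"]    # adding "chatglm_onnx" enables the ONNX int8 backend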
request_llm/bridge_all.py
@@ -19,9 +19,6 @@ from .bridge_chatgpt import predict as chatgpt_ui
 from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui
 from .bridge_chatglm import predict as chatglm_ui
 
-# from .bridge_tgui import predict_no_ui_long_connection as tgui_noui
-# from .bridge_tgui import predict as tgui_ui
-
 colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044']
 
 class LazyloadTiktoken(object):
@@ -322,6 +319,22 @@ if "internlm" in AVAIL_LLM_MODELS:
         })
     except:
         print(trimmed_format_exc())
+if "chatglm_onnx" in AVAIL_LLM_MODELS:
+    try:
+        from .bridge_chatglmonnx import predict_no_ui_long_connection as chatglm_onnx_noui
+        from .bridge_chatglmonnx import predict as chatglm_onnx_ui
+        model_info.update({
+            "chatglm_onnx": {
+                "fn_with_ui": chatglm_onnx_ui,
+                "fn_without_ui": chatglm_onnx_noui,
+                "endpoint": None,
+                "max_token": 4096,
+                "tokenizer": tokenizer_gpt35,
+                "token_cnt": get_token_num_gpt35,
+            }
+        })
+    except:
+        print(trimmed_format_exc())
 
 def LLM_CATCH_EXCEPTION(f):
     """
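For context, a hedged sketch of how a registered model_info entry is typically consumed by the dispatcher; call_model is a hypothetical helper for illustration, not a function in this commit:

# Hypothetical dispatch sketch: model_info maps a model name to its callables and token helpers.
def call_model(llm_model, inputs, llm_kwargs, history, sys_prompt, model_info):
    entry = model_info[llm_model]        # e.g. the "chatglm_onnx" entry registered above
    method = entry["fn_without_ui"]      # chatglm_onnx_noui (the no-UI, long-connection variant)
    return method(inputs, llm_kwargs, history=history, sys_prompt=sys_prompt)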
request_llm/bridge_chatglmonnx.py (new file, 73 lines)

model_name = "ChatGLM-ONNX"
cmd_to_install = "`pip install request_llm/requirements_chatglm_onnx.txt`"


from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf
from multiprocessing import Process, Pipe
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM

from .chatglmoonx import ChatGLMModel, chat_template


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Local Model
# ------------------------------------------------------------------------------------------------------------------------
@SingletonLocalLLM
class GetONNXGLMHandle(LocalLLMHandle):

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        import os, glob
        if not len(glob.glob("./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/*.bin")) >= 7: # 该模型有七个 bin 文件
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="K024/ChatGLM-6b-onnx-u8s8", local_dir="./request_llm/ChatGLM-6b-onnx-u8s8")
        def create_model():
            return ChatGLMModel(
                tokenizer_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model",
                onnx_model_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"
            )
        self._model = create_model()
        return self._model, None

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        def adaptor(kwargs):
            query = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            return query, max_length, top_p, temperature, history

        query, max_length, top_p, temperature, history = adaptor(kwargs)

        prompt = chat_template(history, query)
        for answer in self._model.generate_iterate(
            prompt,
            max_generated_tokens=max_length,
            top_k=1,
            top_p=top_p,
            temperature=temperature,
        ):
            yield answer

    def try_to_import_special_deps(self, **kwargs):
        # import something that will raise error if the user does not install requirement_*.txt
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        pass


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name)
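A minimal usage sketch of the two entry points this file exports, assuming the repository root is on sys.path and the model files can be downloaded on first use; the llm_kwargs keys follow what llm_stream_generator reads:

from request_llm.bridge_chatglmonnx import predict_no_ui_long_connection

llm_kwargs = {'max_length': 512, 'top_p': 0.7, 'temperature': 1.0}
reply = predict_no_ui_long_connection("你好", llm_kwargs, history=[], sys_prompt="You are a helpful assistant.")
print(reply)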
request_llm/bridge_internlm.py
@@ -1,23 +1,25 @@
+model_name = "InternLM"
+cmd_to_install = "`pip install request_llm/requirements_chatglm.txt`"
+
 from transformers import AutoModel, AutoTokenizer
 import time
 import threading
 import importlib
-from toolbox import update_ui, get_conf, Singleton
+from toolbox import update_ui, get_conf
 from multiprocessing import Process, Pipe
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
 
-model_name = "InternLM"
-cmd_to_install = "`pip install ???`"
-load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"
-
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Local Model Utils
+# ------------------------------------------------------------------------------------------------------------------------
 def try_to_import_special_deps():
     import sentencepiece
 
-user_prompt = "<|User|>:{user}<eoh>\n"
-robot_prompt = "<|Bot|>:{robot}<eoa>\n"
-cur_query_prompt = "<|User|>:{user}<eoh>\n<|Bot|>:"
-
 
 def combine_history(prompt, hist):
+    user_prompt = "<|User|>:{user}<eoh>\n"
+    robot_prompt = "<|Bot|>:{robot}<eoa>\n"
+    cur_query_prompt = "<|User|>:{user}<eoh>\n<|Bot|>:"
     messages = hist
     total_prompt = ""
     for message in messages:
@@ -29,24 +31,22 @@ def combine_history(prompt, hist):
     total_prompt = total_prompt + cur_query_prompt.replace("{user}", prompt)
     return total_prompt
 
-@Singleton
-class GetInternlmHandle(Process):
-    def __init__(self):
-        # ⭐主进程执行
-        super().__init__(daemon=True)
-        self.parent, self.child = Pipe()
-        self._model = None
-        self._tokenizer = None
-        self.info = ""
-        self.success = True
-        self.check_dependency()
-        self.start()
-        self.threadLock = threading.Lock()
-
-    def ready(self):
-        # ⭐主进程执行
-        return self._model is not None
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Local Model
+# ------------------------------------------------------------------------------------------------------------------------
+@SingletonLocalLLM
+class GetInternlmHandle(LocalLLMHandle):
+
+    def load_model_info(self):
+        # 🏃♂️🏃♂️🏃♂️ 子进程执行
+        self.model_name = model_name
+        self.cmd_to_install = cmd_to_install
+
+    def try_to_import_special_deps(self, **kwargs):
+        """
+        import something that will raise error if the user does not install requirement_*.txt
+        """
+        import sentencepiece
 
     def load_model_and_tokenizer(self):
         # 🏃♂️🏃♂️🏃♂️ 子进程执行
@@ -196,117 +196,7 @@ class GetInternlmHandle(Process):
         return
 
 
-    def check_dependency(self):
-        # 🏃♂️🏃♂️🏃♂️ 子进程执行
-        try:
-            try_to_import_special_deps()
-            self.info = "依赖检测通过"
-            self.success = True
-        except:
-            self.info = f"缺少{model_name}的依赖,如果要使用{model_name},除了基础的pip依赖以外,您还需要运行{cmd_to_install}安装{model_name}的依赖。"
-            self.success = False
-
-    def run(self):
-        # 🏃♂️🏃♂️🏃♂️ 子进程执行
-        # 第一次运行,加载参数
-        try:
-            self._model, self._tokenizer = self.load_model_and_tokenizer()
-        except:
-            from toolbox import trimmed_format_exc
-            self.child.send(f'[Local Message] 不能正常加载{model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
-            raise RuntimeError(f"不能正常加载{model_name}的参数!")
-
-        while True:
-            # 进入任务等待状态
-            kwargs = self.child.recv()
-            # 收到消息,开始请求
-            try:
-                for response_full in self.llm_stream_generator(**kwargs):
-                    self.child.send(response_full)
-            except:
-                from toolbox import trimmed_format_exc
-                self.child.send(f'[Local Message] 调用{model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
-            # 请求处理结束,开始下一个循环
-            self.child.send('[Finish]')
-
-    def stream_chat(self, **kwargs):
-        # ⭐主进程执行
-        self.threadLock.acquire()
-        self.parent.send(kwargs)
-        while True:
-            res = self.parent.recv()
-            if res != '[Finish]':
-                yield res
-            else:
-                break
-        self.threadLock.release()
-
 # ------------------------------------------------------------------------------------------------------------------------
-# 🔌💻 GPT-Academic
+# 🔌💻 GPT-Academic Interface
 # ------------------------------------------------------------------------------------------------------------------------
-def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
-    """
-        ⭐多线程方法
-        函数的说明请见 request_llm/bridge_all.py
-    """
-    _llm_handle = GetInternlmHandle()
-    if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info
-    if not _llm_handle.success:
-        error = _llm_handle.info
-        _llm_handle = None
-        raise RuntimeError(error)
-
-    # chatglm 没有 sys_prompt 接口,因此把prompt加入 history
-    history_feedin = []
-    history_feedin.append(["What can I do?", sys_prompt])
-    for i in range(len(history)//2):
-        history_feedin.append([history[2*i], history[2*i+1]] )
-
-    watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
-    response = ""
-    for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
-        if len(observe_window) >= 1: observe_window[0] = response
-        if len(observe_window) >= 2:
-            if (time.time()-observe_window[1]) > watch_dog_patience:
-                raise RuntimeError("程序终止。")
-    return response
-
-
-
-def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
-    """
-        ⭐单线程方法
-        函数的说明请见 request_llm/bridge_all.py
-    """
-    chatbot.append((inputs, ""))
-
-    _llm_handle = GetInternlmHandle()
-    chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info)
-    yield from update_ui(chatbot=chatbot, history=[])
-    if not _llm_handle.success:
-        _llm_handle = None
-        return
-
-    if additional_fn is not None:
-        from core_functional import handle_core_functionality
-        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
-
-    # 处理历史信息
-    history_feedin = []
-    history_feedin.append(["What can I do?", system_prompt] )
-    for i in range(len(history)//2):
-        history_feedin.append([history[2*i], history[2*i+1]] )
-
-    # 开始接收chatglm的回复
-    response = f"[Local Message]: 等待{model_name}响应中 ..."
-    for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
-        chatbot[-1] = (inputs, response)
-        yield from update_ui(chatbot=chatbot, history=history)
-
-    # 总结输出
-    if response == f"[Local Message]: 等待{model_name}响应中 ...":
-        response = f"[Local Message]: {model_name}响应异常 ..."
-    history.extend([inputs, response])
-    yield from update_ui(chatbot=chatbot, history=history)
+predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetInternlmHandle, model_name)
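To make the InternLM prompt format concrete, a small worked example of what combine_history assembles, assuming the (unchanged) loop body renders each prior (user, bot) pair with the user_prompt and robot_prompt templates shown above; the history strings are illustrative:

hist = [["What can I do?", "I can answer questions."]]      # one prior (user, bot) exchange
prompt = combine_history("Explain ONNX in one sentence.", hist)
# Expected shape of the result:
#   <|User|>:What can I do?<eoh>
#   <|Bot|>:I can answer questions.<eoa>
#   <|User|>:Explain ONNX in one sentence.<eoh>
#   <|Bot|>: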
request_llm/chatglmoonx.py (new file, 229 lines)

# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py
# ------------------------------------------------------------------------------------------------------------------------
import re
import numpy as np
# import torch
from onnxruntime import InferenceSession, SessionOptions


# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU,
# although they are documented as supported on CUDA.
providers = ["CPUExecutionProvider"]

# if torch.cuda.is_available():
#     providers = ["CUDAExecutionProvider"] + providers


# Default paths
tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model"
onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"


# input & output names
past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]]
present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]]
output_names = ["logits"] + present_names


# default kv_cache for first inference
default_past_key_values = {
    k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names
}


def chat_template(history: list[tuple[str, str]], current: str):
    prompt = ""
    chat_round = 0
    for question, answer in history:
        prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n"
        chat_round += 1
    prompt += f"[Round {chat_round}]\n问:{current}\n答:"
    return prompt


def process_response(response: str):
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    punkts = [
        [",", ","],
        ["!", "!"],
        [":", ":"],
        [";", ";"],
        ["\?", "?"],
    ]
    for item in punkts:
        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
    return response


class ChatGLMModel():

    def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None:
        self.tokenizer = ChatGLMTokenizer(tokenizer_path)
        options = SessionOptions()
        options.enable_profiling = profile
        self.session = InferenceSession(onnx_model_path, options, providers=providers)
        self.eop_token_id = self.tokenizer["<eop>"]

    def prepare_input(self, prompt: str):
        input_ids, prefix_mask = self.tokenizer.encode(prompt)

        input_ids = np.array([input_ids], dtype=np.longlong)
        prefix_mask = np.array([prefix_mask], dtype=np.longlong)

        return input_ids, prefix_mask, default_past_key_values

    def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1):
        # softmax with temperature
        exp_logits = np.exp(logits / temperature)
        probs = exp_logits / np.sum(exp_logits)

        # top k
        top_k_idx = np.argsort(-probs)[:top_k]
        top_k_probs = probs[top_k_idx]

        # top p
        cumsum_probs = np.cumsum(top_k_probs)
        top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0
        top_k_probs = top_k_probs / np.sum(top_k_probs)

        # sample
        next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs)
        return next_token[0].item()

    def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1):
        input_ids, prefix_mask, past_key_values = self.prepare_input(prompt)
        output_tokens = []

        while True:
            inputs = {
                "input_ids": input_ids,
                "prefix_mask": prefix_mask,
                "use_past": np.array(len(output_tokens) > 0),
            }
            inputs.update(past_key_values)

            logits, *past_key_values = self.session.run(output_names, inputs)
            past_key_values = { k: v for k, v in zip(past_names, past_key_values) }

            next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature)

            output_tokens += [next_token]

            if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens:
                break

            input_ids = np.array([[next_token]], dtype=np.longlong)
            prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1)

            yield process_response(self.tokenizer.decode(output_tokens))

        return process_response(self.tokenizer.decode(output_tokens))


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
# ------------------------------------------------------------------------------------------------------------------------

import re
from sentencepiece import SentencePieceProcessor


def replace_spaces_with_blank(match: re.Match[str]):
    return f"<|blank_{len(match.group())}|>"


def replace_blank_with_spaces(match: re.Match[str]):
    return " " * int(match.group(1))


class ChatGLMTokenizer:
    def __init__(self, vocab_file):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))

    def __len__(self):
        return len(self.text_tokenizer)

    def __getitem__(self, key: str):
        return self.text_tokenizer[key]

    def preprocess(self, text: str, linebreak=True, whitespaces=True):
        if linebreak:
            text = text.replace("\n", "<n>")
        if whitespaces:
            text = text.replace("\t", "<|tab|>")
            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
        return text

    def encode(
        self, text: str, text_pair: str = None,
        linebreak=True, whitespaces=True,
        add_dummy_prefix=True, special_tokens=True,
    ) -> tuple[list[int], list[int]]:
        """
        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
        text_pair: causal LM part.
        linebreak: Whether to encode newline (\n) in text.
        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self.preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text

        tokens = self.text_tokenizer.encode(text)
        prefix_mask = [1] * len(tokens)
        if special_tokens:
            tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer["<sop>"]]
            prefix_mask += [1, 0]

        if text_pair is not None:
            text_pair = self.preprocess(text_pair, linebreak, whitespaces)
            pair_tokens = self.text_tokenizer.encode(text_pair)
            tokens += pair_tokens
            prefix_mask += [0] * len(pair_tokens)
            if special_tokens:
                tokens += [self.text_tokenizer["<eop>"]]
                prefix_mask += [0]

        return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask

    def decode(self, text_ids: list[int]) -> str:
        text = self.text_tokenizer.decode(text_ids)
        text = text.replace("<n>", "\n")
        text = text.replace("<|tab|>", "\t")
        text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text)
        return text
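A standalone usage sketch of the classes above; the paths assume the K024/ChatGLM-6b-onnx-u8s8 snapshot has already been downloaded to the directory used by bridge_chatglmonnx.py:

from request_llm.chatglmoonx import ChatGLMModel, chat_template

model = ChatGLMModel(
    tokenizer_path="./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model",
    onnx_model_path="./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx",
)
prompt = chat_template(history=[], current="你好")
answer = ""
for answer in model.generate_iterate(prompt, max_generated_tokens=64):
    pass          # each yield is the full decoded answer so far
print(answer)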
request_llm/local_llm_class.py (new file, 178 lines)

from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf, Singleton
from multiprocessing import Process, Pipe

def SingletonLocalLLM(cls):
    """
    一个单实例装饰器
    """
    _instance = {}
    def _singleton(*args, **kargs):
        if cls not in _instance:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        elif _instance[cls].corrupted:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        else:
            return _instance[cls]
    return _singleton

class LocalLLMHandle(Process):
    def __init__(self):
        # ⭐主进程执行
        super().__init__(daemon=True)
        self.corrupted = False
        self.load_model_info()
        self.parent, self.child = Pipe()
        self.running = True
        self._model = None
        self._tokenizer = None
        self.info = ""
        self.check_dependency()
        self.start()
        self.threadLock = threading.Lock()

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")
        self.model_name = ""
        self.cmd_to_install = ""

    def load_model_and_tokenizer(self):
        """
        This function should return the model and the tokenizer
        """
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def try_to_import_special_deps(self, **kwargs):
        """
        import something that will raise error if the user does not install requirement_*.txt
        """
        # ⭐主进程执行
        raise NotImplementedError("Method not implemented yet")

    def check_dependency(self):
        # ⭐主进程执行
        try:
            self.try_to_import_special_deps()
            self.info = "依赖检测通过"
            self.running = True
        except:
            self.info = f"缺少{self.model_name}的依赖,如果要使用{self.model_name},除了基础的pip依赖以外,您还需要运行{self.cmd_to_install}安装{self.model_name}的依赖。"
            self.running = False

    def run(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        # 第一次运行,加载参数
        try:
            self._model, self._tokenizer = self.load_model_and_tokenizer()
        except:
            self.running = False
            from toolbox import trimmed_format_exc
            self.child.send(f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
            self.child.send('[FinishBad]')
            raise RuntimeError(f"不能正常加载{self.model_name}的参数!")

        while True:
            # 进入任务等待状态
            kwargs = self.child.recv()
            # 收到消息,开始请求
            try:
                for response_full in self.llm_stream_generator(**kwargs):
                    self.child.send(response_full)
                self.child.send('[Finish]')
                # 请求处理结束,开始下一个循环
            except:
                from toolbox import trimmed_format_exc
                self.child.send(f'[Local Message] 调用{self.model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
                self.child.send('[Finish]')

    def stream_chat(self, **kwargs):
        # ⭐主进程执行
        self.threadLock.acquire()
        self.parent.send(kwargs)
        while True:
            res = self.parent.recv()
            if res == '[Finish]':
                break
            if res == '[FinishBad]':
                self.running = False
                self.corrupted = True
                break
            else:
                yield res
        self.threadLock.release()



def get_local_llm_predict_fns(LLMSingletonClass, model_name):
    load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"

    def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
        """
        ⭐多线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        _llm_handle = LLMSingletonClass()
        if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info

        # chatglm 没有 sys_prompt 接口,因此把prompt加入 history
        history_feedin = []
        history_feedin.append(["What can I do?", sys_prompt])
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
        response = ""
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            if len(observe_window) >= 1:
                observe_window[0] = response
            if len(observe_window) >= 2:
                if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
        return response

    def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
        """
        ⭐单线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        chatbot.append((inputs, ""))

        _llm_handle = LLMSingletonClass()
        chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info)
        yield from update_ui(chatbot=chatbot, history=[])

        if additional_fn is not None:
            from core_functional import handle_core_functionality
            inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

        # 处理历史信息
        history_feedin = []
        history_feedin.append(["What can I do?", system_prompt] )
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        # 开始接收回复
        response = f"[Local Message]: 等待{model_name}响应中 ..."
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            chatbot[-1] = (inputs, response)
            yield from update_ui(chatbot=chatbot, history=history)

        # 总结输出
        if response == f"[Local Message]: 等待{model_name}响应中 ...":
            response = f"[Local Message]: {model_name}响应异常 ..."
        history.extend([inputs, response])
        yield from update_ui(chatbot=chatbot, history=history)

    return predict_no_ui_long_connection, predict
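For illustration, a hypothetical new backend built on this base class, following the same pattern bridge_chatglmonnx.py uses above; MyLocalModel and its loading details are placeholders, not part of this commit:

from request_llm.local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM

model_name = "MyLocalModel"                                                 # placeholder name
cmd_to_install = "`pip install -r request_llm/requirements_mymodel.txt`"   # placeholder hint

@SingletonLocalLLM
class GetMyModelHandle(LocalLLMHandle):
    def load_model_info(self):
        # runs in the subprocess
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # runs in the subprocess; must return (model, tokenizer)
        return None, None           # replace with real model/tokenizer loading

    def llm_stream_generator(self, **kwargs):
        # runs in the subprocess; yield progressively longer answers
        yield "partial answer"
        yield "partial answer, completed"

    def try_to_import_special_deps(self, **kwargs):
        # runs in the main process; raise here if extra dependencies are missing
        pass

# One call wires the class into the two entry points that bridge_all.py expects.
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetMyModelHandle, model_name)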
request_llm/requirements_chatglm.txt
@@ -1,5 +1,5 @@
 protobuf
-transformers==4.27.1
+transformers>=4.27.1
 cpm_kernels
 torch>=1.10
 mdtex2html
request_llm/requirements_chatglm_onnx.txt (new file, 11 lines)

protobuf
transformers>=4.27.1
cpm_kernels
torch>=1.10
mdtex2html
sentencepiece
numpy
onnxruntime
sentencepiece
streamlit
streamlit-chat