better local model interaction

binary-husky 2023-10-31 16:17:52 +08:00
parent 08f036aafd
commit 136162ec0d
3 changed files with 146 additions and 187 deletions

View File

@@ -1,42 +1,29 @@
+model_name = "ChatGLM"
+cmd_to_install = "`pip install -r request_llms/requirements_chatglm.txt`"
+
 from transformers import AutoModel, AutoTokenizer
-import time
-import threading
-import importlib
-from toolbox import update_ui, get_conf, ProxyNetworkActivate
-from multiprocessing import Process, Pipe
-
-load_message = "ChatGLM尚未加载加载需要一段时间。注意取决于`config.py`的配置ChatGLM消耗大量的内存CPU或显存GPU也许会导致低配计算机卡死 ……"
-
-#################################################################################
-class GetGLMHandle(Process):
-    def __init__(self):
-        super().__init__(daemon=True)
-        self.parent, self.child = Pipe()
-        self.chatglm_model = None
-        self.chatglm_tokenizer = None
-        self.info = ""
-        self.success = True
-        self.check_dependency()
-        self.start()
-        self.threadLock = threading.Lock()
-
-    def check_dependency(self):
-        try:
-            import sentencepiece
-            self.info = "依赖检测通过"
-            self.success = True
-        except:
-            self.info = "缺少ChatGLM的依赖如果要使用ChatGLM除了基础的pip依赖以外您还需要运行`pip install -r request_llms/requirements_chatglm.txt`安装ChatGLM的依赖。"
-            self.success = False
-
-    def ready(self):
-        return self.chatglm_model is not None
-
-    def run(self):
-        # 子进程执行
-        # 第一次运行,加载参数
-        retry = 0
+from toolbox import get_conf, ProxyNetworkActivate
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Local Model
+# ------------------------------------------------------------------------------------------------------------------------
+@SingletonLocalLLM
+class GetGLM2Handle(LocalLLMHandle):
+
+    def load_model_info(self):
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
+        self.model_name = model_name
+        self.cmd_to_install = cmd_to_install
+
+    def load_model_and_tokenizer(self):
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
+        import os, glob
+        import os
+        import platform
         LOCAL_MODEL_QUANT, device = get_conf('LOCAL_MODEL_QUANT', 'LOCAL_MODEL_DEVICE')
         if LOCAL_MODEL_QUANT == "INT4":         # INT4
@@ -46,122 +33,47 @@ class GetGLMHandle(Process):
         else:
             _model_name_ = "THUDM/chatglm2-6b"  # FP16
-        while True:
-            try:
-                with ProxyNetworkActivate('Download_LLM'):
-                    if self.chatglm_model is None:
-                        self.chatglm_tokenizer = AutoTokenizer.from_pretrained(_model_name_, trust_remote_code=True)
-                        if device=='cpu':
-                            self.chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).float()
-                        else:
-                            self.chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).half().cuda()
-                        self.chatglm_model = self.chatglm_model.eval()
-                        break
-                    else:
-                        break
-            except:
-                retry += 1
-                if retry > 3:
-                    self.child.send('[Local Message] Call ChatGLM fail 不能正常加载ChatGLM的参数。')
-                    raise RuntimeError("不能正常加载ChatGLM的参数")
-
-        while True:
-            # 进入任务等待状态
-            kwargs = self.child.recv()
-            # 收到消息,开始请求
-            try:
-                for response, history in self.chatglm_model.stream_chat(self.chatglm_tokenizer, **kwargs):
-                    self.child.send(response)
-                    # # 中途接收可能的终止指令(如果有的话)
-                    # if self.child.poll():
-                    #     command = self.child.recv()
-                    #     if command == '[Terminate]': break
-            except:
-                from toolbox import trimmed_format_exc
-                self.child.send('[Local Message] Call ChatGLM fail.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
-            # 请求处理结束,开始下一个循环
-            self.child.send('[Finish]')
-
-    def stream_chat(self, **kwargs):
-        # 主进程执行
-        self.threadLock.acquire()
-        self.parent.send(kwargs)
-        while True:
-            res = self.parent.recv()
-            if res != '[Finish]':
-                yield res
-            else:
-                break
-        self.threadLock.release()
-
-global glm_handle
-glm_handle = None
-#################################################################################
-def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
-    """
-        多线程方法
-        函数的说明请见 request_llms/bridge_all.py
-    """
-    global glm_handle
-    if glm_handle is None:
-        glm_handle = GetGLMHandle()
-        if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + glm_handle.info
-        if not glm_handle.success:
-            error = glm_handle.info
-            glm_handle = None
-            raise RuntimeError(error)
-
-    # chatglm 没有 sys_prompt 接口因此把prompt加入 history
-    history_feedin = []
-    history_feedin.append(["What can I do?", sys_prompt])
-    for i in range(len(history)//2):
-        history_feedin.append([history[2*i], history[2*i+1]] )
-
-    watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
-    response = ""
-    for response in glm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
-        if len(observe_window) >= 1: observe_window[0] = response
-        if len(observe_window) >= 2:
-            if (time.time()-observe_window[1]) > watch_dog_patience:
-                raise RuntimeError("程序终止。")
-    return response
-
-def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
-    """
-        单线程方法
-        函数的说明请见 request_llms/bridge_all.py
-    """
-    chatbot.append((inputs, ""))
-
-    global glm_handle
-    if glm_handle is None:
-        glm_handle = GetGLMHandle()
-        chatbot[-1] = (inputs, load_message + "\n\n" + glm_handle.info)
-        yield from update_ui(chatbot=chatbot, history=[])
-        if not glm_handle.success:
-            glm_handle = None
-            return
-
-    if additional_fn is not None:
-        from core_functional import handle_core_functionality
-        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
-
-    # 处理历史信息
-    history_feedin = []
-    history_feedin.append(["What can I do?", system_prompt] )
-    for i in range(len(history)//2):
-        history_feedin.append([history[2*i], history[2*i+1]] )
-
-    # 开始接收chatglm的回复
-    response = "[Local Message] 等待ChatGLM响应中 ..."
-    for response in glm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
-        chatbot[-1] = (inputs, response)
-        yield from update_ui(chatbot=chatbot, history=history)
-
-    # 总结输出
-    if response == "[Local Message] 等待ChatGLM响应中 ...":
-        response = "[Local Message] ChatGLM响应异常 ..."
-    history.extend([inputs, response])
-    yield from update_ui(chatbot=chatbot, history=history)
+        with ProxyNetworkActivate('Download_LLM'):
+            chatglm_tokenizer = AutoTokenizer.from_pretrained(_model_name_, trust_remote_code=True)
+            if device=='cpu':
+                chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).float()
+            else:
+                chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).half().cuda()
+            chatglm_model = chatglm_model.eval()
+
+        self._model = chatglm_model
+        self._tokenizer = chatglm_tokenizer
+        return self._model, self._tokenizer
+
+    def llm_stream_generator(self, **kwargs):
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
+        def adaptor(kwargs):
+            query = kwargs['query']
+            max_length = kwargs['max_length']
+            top_p = kwargs['top_p']
+            temperature = kwargs['temperature']
+            history = kwargs['history']
+            return query, max_length, top_p, temperature, history
+
+        query, max_length, top_p, temperature, history = adaptor(kwargs)
+
+        for response, history in self._model.stream_chat(self._tokenizer,
+                                                          query,
+                                                          history,
+                                                          max_length=max_length,
+                                                          top_p=top_p,
+                                                          temperature=temperature,
+                                                         ):
+            yield response
+
+    def try_to_import_special_deps(self, **kwargs):
+        # import something that will raise error if the user does not install requirement_*.txt
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 主进程执行
+        import importlib
+        # importlib.import_module('modelscope')
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 GPT-Academic Interface
+# ------------------------------------------------------------------------------------------------------------------------
+predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetGLM2Handle, model_name)
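
For orientation, here is a hedged usage sketch of the pair of functions that get_local_llm_predict_fns returns. This is not part of the commit: the module path and the sampling values are assumptions, while the keyword names max_length / top_p / temperature and the sys_prompt / observe_window parameters appear in the code above.

# Hypothetical usage sketch -- module path and parameter values are assumptions.
from request_llms.bridge_chatglm import predict_no_ui_long_connection  # path assumed

llm_kwargs = {'max_length': 2048, 'top_p': 0.7, 'temperature': 0.95}   # illustrative values
answer = predict_no_ui_long_connection(
    inputs="Please introduce yourself briefly.",
    llm_kwargs=llm_kwargs,
    history=[],                                # flat list: [user, assistant, user, assistant, ...]
    sys_prompt="You are a helpful assistant.",
    observe_window=[],                         # optional slot for streamed text / watchdog timestamp
)
print(answer)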

View File

@@ -12,7 +12,7 @@ from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
 @SingletonLocalLLM
-class GetONNXGLMHandle(LocalLLMHandle):
+class GetGLM3Handle(LocalLLMHandle):

     def load_model_info(self):
         # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
@@ -69,10 +69,10 @@ class GetONNXGLMHandle(LocalLLMHandle):
         # import something that will raise error if the user does not install requirement_*.txt
         # 🏃‍♂️🏃‍♂️🏃‍♂️ 主进程执行
         import importlib
-        importlib.import_module('modelscope')
+        # importlib.import_module('modelscope')


 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 GPT-Academic Interface
 # ------------------------------------------------------------------------------------------------------------------------
-predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name, history_format='chatglm3')
+predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetGLM3Handle, model_name, history_format='chatglm3')

View File

@@ -1,15 +1,16 @@
-from transformers import AutoModel, AutoTokenizer
 import time
 import threading
-import importlib
-from toolbox import update_ui, get_conf, Singleton
+from toolbox import update_ui
 from multiprocessing import Process, Pipe
+from contextlib import redirect_stdout

 def SingletonLocalLLM(cls):
     """
     一个单实例装饰器
     """
     _instance = {}
     def _singleton(*args, **kargs):
         if cls not in _instance:
             _instance[cls] = cls(*args, **kargs)
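
As a quick, self-contained illustration of what the SingletonLocalLLM decorator in the hunk above guarantees: a decorated class is constructed only once, so the expensive model load happens a single time no matter how many call sites ask for a handle. DummyHandle below is made up for this sketch, and the real decorator may contain additional branches not shown in this hunk.

# Toy demonstration of the singleton-decorator pattern used above.
def SingletonLocalLLM(cls):
    _instance = {}
    def _singleton(*args, **kargs):
        if cls not in _instance:
            _instance[cls] = cls(*args, **kargs)
        return _instance[cls]
    return _singleton

@SingletonLocalLLM
class DummyHandle:
    def __init__(self):
        print("loading model (expensive, should happen exactly once)")

a = DummyHandle()
b = DummyHandle()
assert a is b  # every caller shares the same handle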
@@ -21,6 +22,28 @@ def SingletonLocalLLM(cls):
         return _instance[cls]
     return _singleton

+
+def reset_tqdm_output():
+    import sys, tqdm
+    def status_printer(self, file):
+        fp = file
+        if fp in (sys.stderr, sys.stdout):
+            getattr(sys.stderr, 'flush', lambda: None)()
+            getattr(sys.stdout, 'flush', lambda: None)()
+
+        def fp_write(s):
+            print(s)
+        last_len = [0]
+
+        def print_status(s):
+            from tqdm.utils import disp_len
+            len_s = disp_len(s)
+            fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0)))
+            last_len[0] = len_s
+        return print_status
+    tqdm.tqdm.status_printer = status_printer
+
+
 class LocalLLMHandle(Process):
     def __init__(self):
         # ⭐主进程执行
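
The reset_tqdm_output patch added above reroutes tqdm's status line through print(), which makes progress output capturable by redirect_stdout. Below is a minimal, simplified sketch of the same idea; it assumes tqdm's status_printer hook behaves as in current releases, and none of the names are taken from the commit except the patching target.

# Simplified sketch: monkeypatch tqdm so its progress line goes through print(),
# which redirect_stdout can then capture (the commit uses this to forward model
# download progress out of a subprocess).
import io
import time
from contextlib import redirect_stdout

import tqdm

def route_tqdm_through_print():
    def status_printer(self, file):           # replaces tqdm.tqdm.status_printer
        def print_status(s):
            print(s)                          # plain print instead of '\r' tricks on stderr
        return print_status
    tqdm.tqdm.status_printer = status_printer

route_tqdm_through_print()
buffer = io.StringIO()
with redirect_stdout(buffer):
    for _ in tqdm.tqdm(range(3), desc="downloading"):
        time.sleep(0.01)

print("captured:", repr(buffer.getvalue()[:80]))  # progress text now lands in the buffer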
@@ -28,6 +51,9 @@ class LocalLLMHandle(Process):
         self.corrupted = False
         self.load_model_info()
         self.parent, self.child = Pipe()
+        # allow redirect_stdout
+        self.std_tag = "[Subprocess Message] "
+        self.child.write = lambda x: self.child.send(self.std_tag + x)
         self.running = True
         self._model = None
         self._tokenizer = None
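
The three lines added above give the child end of the Pipe a write method; that is what later lets run() wrap model loading in redirect_stdout(self.child) and stream everything the subprocess prints back to the parent. A self-contained sketch of the trick follows; all names in it are illustrative, not taken from the project.

# Sketch: redirect a subprocess's stdout into a multiprocessing Pipe by giving
# the child connection a .write method, then read the output back in the parent.
from contextlib import redirect_stdout
from multiprocessing import Pipe, Process

STD_TAG = "[Subprocess Message] "

def worker(child):
    child.write = lambda text: child.send(STD_TAG + text)  # same trick as the commit
    child.flush = lambda: None                             # extra safety, not in the commit
    with redirect_stdout(child):
        print("loading weights ... 42%")                   # stands in for model-loading output
    child.send("[Finish]")

if __name__ == "__main__":
    parent, child = Pipe()
    Process(target=worker, args=(child,), daemon=True).start()
    while True:
        msg = parent.recv()
        if msg == "[Finish]":
            break
        if msg.startswith(STD_TAG) and msg[len(STD_TAG):].strip():
            print("forwarded:", msg[len(STD_TAG):].rstrip())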
@@ -52,7 +78,7 @@ class LocalLLMHandle(Process):
     def llm_stream_generator(self, **kwargs):
         # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
         raise NotImplementedError("Method not implemented yet")

     def try_to_import_special_deps(self, **kwargs):
         """
         import something that will raise error if the user does not install requirement_*.txt
@@ -64,7 +90,7 @@ class LocalLLMHandle(Process):
         # ⭐主进程执行
         try:
             self.try_to_import_special_deps()
-            self.info = "依赖检测通过"
+            self.info = "`依赖检测通过`"
             self.running = True
         except:
             self.info = f"缺少{self.model_name}的依赖,如果要使用{self.model_name}除了基础的pip依赖以外您还需要运行{self.cmd_to_install}安装{self.model_name}的依赖。"
@@ -73,15 +99,21 @@ class LocalLLMHandle(Process):
     def run(self):
         # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
         # 第一次运行,加载参数
+        reset_tqdm_output()
+        self.info = "`尝试加载模型`"
         try:
-            self._model, self._tokenizer = self.load_model_and_tokenizer()
+            with redirect_stdout(self.child):
+                self._model, self._tokenizer = self.load_model_and_tokenizer()
         except:
+            self.info = "`加载模型失败`"
             self.running = False
             from toolbox import trimmed_format_exc
-            self.child.send(f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
+            self.child.send(
+                f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
             self.child.send('[FinishBad]')
             raise RuntimeError(f"不能正常加载{self.model_name}的参数!")

+        self.info = "`准备就绪`"
         while True:
             # 进入任务等待状态
             kwargs = self.child.recv()
@@ -93,25 +125,35 @@ class LocalLLMHandle(Process):
                 # 请求处理结束,开始下一个循环
             except:
                 from toolbox import trimmed_format_exc
-                self.child.send(f'[Local Message] 调用{self.model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
+                self.child.send(
+                    f'[Local Message] 调用{self.model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
                 self.child.send('[Finish]')

     def stream_chat(self, **kwargs):
         # ⭐主进程执行
-        self.threadLock.acquire()
-        self.parent.send(kwargs)
-        while True:
-            res = self.parent.recv()
-            if res == '[Finish]':
-                break
-            if res == '[FinishBad]':
-                self.running = False
-                self.corrupted = True
-                break
-            else:
-                yield res
-        self.threadLock.release()
+        if self.info == "`准备就绪`":
+            yield "`正在等待线程锁,排队中请稍后 ...`"
+        with self.threadLock:
+            self.parent.send(kwargs)
+            std_out = ""
+            std_out_clip_len = 4096
+            while True:
+                res = self.parent.recv()
+                if res.startswith(self.std_tag):
+                    new_output = res[len(self.std_tag):]
+                    std_out = std_out[:std_out_clip_len]
+                    print(new_output, end='')
+                    std_out = new_output + std_out
+                    yield self.std_tag + '\n```\n' + std_out + '\n```\n'
+                elif res == '[Finish]':
+                    break
+                elif res == '[FinishBad]':
+                    self.running = False
+                    self.corrupted = True
+                    break
+                else:
+                    std_out = ""
+                    yield res


 def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='classic'):
@@ -123,15 +165,17 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='classic'):
         函数的说明请见 request_llms/bridge_all.py
         """
         _llm_handle = LLMSingletonClass()
-        if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info
-        if not _llm_handle.running: raise RuntimeError(_llm_handle.info)
+        if len(observe_window) >= 1:
+            observe_window[0] = load_message + "\n\n" + _llm_handle.info
+        if not _llm_handle.running:
+            raise RuntimeError(_llm_handle.info)

         if history_format == 'classic':
             # 没有 sys_prompt 接口因此把prompt加入 history
             history_feedin = []
             history_feedin.append([sys_prompt, "Certainly!"])
             for i in range(len(history)//2):
-                history_feedin.append([history[2*i], history[2*i+1]] )
+                history_feedin.append([history[2*i], history[2*i+1]])
         elif history_format == 'chatglm3':
             # 有 sys_prompt 接口
             conversation_cnt = len(history) // 2
@@ -145,24 +189,24 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='classic'):
                     what_gpt_answer["role"] = "assistant"
                     what_gpt_answer["content"] = history[index+1]
                     if what_i_have_asked["content"] != "":
-                        if what_gpt_answer["content"] == "": continue
+                        if what_gpt_answer["content"] == "":
+                            continue
                         history_feedin.append(what_i_have_asked)
                         history_feedin.append(what_gpt_answer)
                     else:
                         history_feedin[-1]['content'] = what_gpt_answer['content']

         watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
         response = ""
         for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
             if len(observe_window) >= 1:
                 observe_window[0] = response
             if len(observe_window) >= 2:
-                if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
+                if (time.time()-observe_window[1]) > watch_dog_patience:
+                    raise RuntimeError("程序终止。")
         return response

-    def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
+    def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream=True, additional_fn=None):
         """
         单线程方法
         函数的说明请见 request_llms/bridge_all.py
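
The 'classic' and 'chatglm3' branches shown above feed the model two different history structures. The data below is purely illustrative: the conversation text is invented, and the shapes are inferred from the loops in this diff.

# Illustrative data only -- conversation text is invented, shapes mirror the loops above.
history = ["Hi", "Hello, how can I help?", "Write a haiku", "Sure, here is one ..."]
sys_prompt = "You are a helpful assistant."

# history_format == 'classic': [user, assistant] pairs; the system prompt is smuggled
# in as the first pair because the model exposes no sys_prompt parameter.
classic_feed = [[sys_prompt, "Certainly!"],
                ["Hi", "Hello, how can I help?"],
                ["Write a haiku", "Sure, here is one ..."]]

# history_format == 'chatglm3': explicit role dicts; per the "有 sys_prompt 接口" comment
# above, the system prompt goes through the model's own interface, so it is not folded in.
chatglm3_feed = [{"role": "user", "content": "Hi"},
                 {"role": "assistant", "content": "Hello, how can I help?"},
                 {"role": "user", "content": "Write a haiku"},
                 {"role": "assistant", "content": "Sure, here is one ..."}]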
@@ -172,11 +216,13 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='classic'):
         _llm_handle = LLMSingletonClass()
         chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info)
         yield from update_ui(chatbot=chatbot, history=[])
-        if not _llm_handle.running: raise RuntimeError(_llm_handle.info)
+        if not _llm_handle.running:
+            raise RuntimeError(_llm_handle.info)

         if additional_fn is not None:
             from core_functional import handle_core_functionality
-            inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
+            inputs, history = handle_core_functionality(
+                additional_fn, inputs, history, chatbot)

         # 处理历史信息
         if history_format == 'classic':
@@ -184,7 +230,7 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='classic'):
             history_feedin = []
             history_feedin.append([system_prompt, "Certainly!"])
             for i in range(len(history)//2):
-                history_feedin.append([history[2*i], history[2*i+1]] )
+                history_feedin.append([history[2*i], history[2*i+1]])
         elif history_format == 'chatglm3':
             # 有 sys_prompt 接口
             conversation_cnt = len(history) // 2
@@ -198,7 +244,8 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='classic'):
                     what_gpt_answer["role"] = "assistant"
                     what_gpt_answer["content"] = history[index+1]
                     if what_i_have_asked["content"] != "":
-                        if what_gpt_answer["content"] == "": continue
+                        if what_gpt_answer["content"] == "":
+                            continue
                         history_feedin.append(what_i_have_asked)
                         history_feedin.append(what_gpt_answer)
                     else:
@@ -216,4 +263,4 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='classic'):
         history.extend([inputs, response])
         yield from update_ui(chatbot=chatbot, history=history)

     return predict_no_ui_long_connection, predict