From c17fc2a9b55b1c7447718a06a3eac4378828bb22 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Mon, 7 Aug 2023 01:58:35 +0800
Subject: [PATCH] I am a large-scale language model from DAMO Academy, and my name is Tongyi Qianwen (通义千问).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.py                  |  2 +-
 request_llm/bridge_all.py  | 16 ++++++++++++++++
 request_llm/bridge_qwen.py | 13 +++++--------
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/config.py b/config.py
index 1d43dd4..bfa4a3a 100644
--- a/config.py
+++ b/config.py
@@ -71,7 +71,7 @@ MAX_RETRY = 2
 # 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 )
 LLM_MODEL = "gpt-3.5-turbo" # 可选 ↓↓↓
 AVAIL_LLM_MODELS = ["gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss", "newbing", "stack-claude"]
-# P.S. 其他可用的模型还包括 ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"]
+# P.S. 其他可用的模型还包括 ["qwen", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"]
 
 
 # ChatGLM(2) Finetune Model Path (如果使用ChatGLM2微调模型,需要把"chatglmft"加入AVAIL_LLM_MODELS中)
diff --git a/request_llm/bridge_all.py b/request_llm/bridge_all.py
index f38711d..1f8a1dc 100644
--- a/request_llm/bridge_all.py
+++ b/request_llm/bridge_all.py
@@ -335,6 +335,22 @@ if "chatglm_onnx" in AVAIL_LLM_MODELS:
         })
     except:
         print(trimmed_format_exc())
+if "qwen" in AVAIL_LLM_MODELS:
+    try:
+        from .bridge_qwen import predict_no_ui_long_connection as qwen_noui
+        from .bridge_qwen import predict as qwen_ui
+        model_info.update({
+            "qwen": {
+                "fn_with_ui": qwen_ui,
+                "fn_without_ui": qwen_noui,
+                "endpoint": None,
+                "max_token": 4096,
+                "tokenizer": tokenizer_gpt35,
+                "token_cnt": get_token_num_gpt35,
+            }
+        })
+    except:
+        print(trimmed_format_exc())
 
 def LLM_CATCH_EXCEPTION(f):
     """
diff --git a/request_llm/bridge_qwen.py b/request_llm/bridge_qwen.py
index 3ca36ab..cd437e4 100644
--- a/request_llm/bridge_qwen.py
+++ b/request_llm/bridge_qwen.py
@@ -32,15 +32,13 @@ class GetONNXGLMHandle(LocalLLMHandle):
 
         model_id = 'qwen/Qwen-7B-Chat'
         revision = 'v1.0.1'
-        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
+        self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
         # use fp16
-        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", revision=revision,
-                                                     trust_remote_code=True, fp16=True).eval()
-        model.generation_config = GenerationConfig.from_pretrained(model_id,
-                                                                   trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
+        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", revision=revision, trust_remote_code=True, fp16=True).eval()
+        model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
         self._model = model
 
-        return self._model, None
+        return self._model, self._tokenizer
 
     def llm_stream_generator(self, **kwargs):
         # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
@@ -54,8 +52,7 @@ class GetONNXGLMHandle(LocalLLMHandle):
 
         query, max_length, top_p, temperature, history = adaptor(kwargs)
 
-        prompt = chat_template(history, query)
-        for response in model.chat(tokenizer, query, history=history, stream=True):
+        for response in self._model.chat(self._tokenizer, query, history=history, stream=True):
             yield response
 
     def try_to_import_special_deps(self, **kwargs):
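
Usage note (illustrative, not part of the patch): with the hunks above applied, Qwen is switched on from config.py alone, because the registration block added to request_llm/bridge_all.py only runs when "qwen" appears in AVAIL_LLM_MODELS. A minimal sketch with an assumed, illustrative model list; only the "qwen" entry is required by this patch:

    # config.py -- illustrative values; "qwen" must be present for the
    # registration block added in request_llm/bridge_all.py to execute.
    LLM_MODEL = "gpt-3.5-turbo"                            # default model; must stay in AVAIL_LLM_MODELS
    AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "gpt-4", "qwen"]  # adding "qwen" enables the new bridge

On first use, bridge_qwen.py then downloads qwen/Qwen-7B-Chat (revision v1.0.1) from ModelScope in fp16, keeps the tokenizer on the handle, and streams replies via self._model.chat(self._tokenizer, query, history=history, stream=True), as in the last hunk.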