access vllm

2024-04-11 22:00:07 +08:00 · 2024-04-11 22:00:07 +08:00 · 2406022c2a
commit 2406022c2a
parent 02b6f26b05
3 changed files with 76 additions and 2 deletions
--- a/docs/use_vllm.md
+++ b/docs/use_vllm.md
@ -0,0 +1,46 @@
+# 使用VLLM
+
+
+## 1. 首先启动 VLLM，自行选择模型
+
+```
+python -m vllm.entrypoints.openai.api_server --model /home/hmp/llm/cache/Qwen1___5-32B-Chat --tensor-parallel-size 2 --dtype=half
+```
+
+这里使用了存储在 `/home/hmp/llm/cache/Qwen1___5-32B-Chat` 的本地模型，可以根据自己的需求更改。
+
+## 2. 测试 VLLM
+
+```
+curl http://localhost:8000/v1/chat/completions \
+-H "Content-Type: application/json" \
+-d '{
+  "model": "/home/hmp/llm/cache/Qwen1___5-32B-Chat",
+  "messages": [
+  {"role": "system", "content": "You are a helpful assistant."},
+  {"role": "user", "content": "怎么实现一个去中心化的控制器?"}
+  ]
+}'
+```
+
+## 3. 配置本项目
+
+```
+API_KEY = "sk-123456789xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx123456789"
+LLM_MODEL = "vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
+API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "http://localhost:8000/v1/chat/completions"}
+```
+
+```
+"vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
+其中
+  "vllm-"                                     是前缀（必要）
+  "/home/hmp/llm/cache/Qwen1___5-32B-Chat"    是模型名（必要）
+  "(max_token=4096)"                          是配置（非必要）
+```
+
+## 4. 启动！
+
+```
+python main.py
+```
--- a/request_llms/bridge_all.py
+++ b/request_llms/bridge_all.py
@ -784,6 +784,29 @@ for model in [m for m in AVAIL_LLM_MODELS if m.startswith("one-api-")]:
            "token_cnt": get_token_num_gpt35,
        },
    })
+# -=-=-=-=-=-=- vllm 对齐支持 -=-=-=-=-=-=-
+for model in [m for m in AVAIL_LLM_MODELS if m.startswith("vllm-")]:
+    # 为了更灵活地接入vllm多模型管理界面，设计了此接口，例子：AVAIL_LLM_MODELS = ["vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=6666)"]
+    # 其中
+    #   "vllm-"             是前缀（必要）
+    #   "mixtral-8x7b"      是模型名（必要）
+    #   "(max_token=6666)"  是配置（非必要）
+    try:
+        _, max_token_tmp = read_one_api_model_name(model)
+    except:
+        print(f"vllm模型 {model} 的 max_token 配置不是整数，请检查配置文件。")
+        continue
+    model_info.update({
+        model: {
+            "fn_with_ui": chatgpt_ui,
+            "fn_without_ui": chatgpt_noui,
+            "can_multi_thread": True,
+            "endpoint": openai_endpoint,
+            "max_token": max_token_tmp,
+            "tokenizer": tokenizer_gpt35,
+            "token_cnt": get_token_num_gpt35,
+        },
+    })


 # -=-=-=-=-=-=- azure模型对齐支持 -=-=-=-=-=-=-
--- a/request_llms/bridge_chatgpt.py
+++ b/request_llms/bridge_chatgpt.py
@ -323,7 +323,10 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
    if not is_any_api_key(llm_kwargs['api_key']):
        raise AssertionError("你提供了错误的API_KEY。\n\n1. 临时解决方案：直接在输入区键入api_key，然后回车提交。\n\n2. 长效解决方案：在config.py中配置。")

-    api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
+    if llm_kwargs['llm_model'].startswith('vllm-'):
+        api_key = 'no-api-key'
+    else:
+        api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])

    headers = {
        "Content-Type": "application/json",
@ -365,7 +368,9 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
    if llm_kwargs['llm_model'].startswith('one-api-'):
        model = llm_kwargs['llm_model'][len('one-api-'):]
        model, _ = read_one_api_model_name(model)
-
+    if llm_kwargs['llm_model'].startswith('vllm-'):
+        model = llm_kwargs['llm_model'][len('vllm-'):]
+        model, _ = read_one_api_model_name(model)
    if model == "gpt-3.5-random": # 随机选择, 绕过openai访问频率限制
        model = random.choice([
            "gpt-3.5-turbo",