access vllm
This commit is contained in:
parent 02b6f26b05
commit 2406022c2a

docs/use_vllm.md (new file, 46 lines)

@@ -0,0 +1,46 @@
# Using vLLM

## 1. First, start vLLM and choose a model yourself

```
python -m vllm.entrypoints.openai.api_server --model /home/hmp/llm/cache/Qwen1___5-32B-Chat --tensor-parallel-size 2 --dtype=half
```

This example uses a local model stored at `/home/hmp/llm/cache/Qwen1___5-32B-Chat`; change the path to whatever model you need.
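Once the server is up, you can confirm which model it is actually serving (and the exact model name string to use in the later steps) by listing its models. A minimal sketch, assuming the `requests` package is installed and the server listens on the default port 8000:

```
# Sanity check: ask the vLLM OpenAI-compatible server which models it serves.
import requests

resp = requests.get("http://localhost:8000/v1/models", timeout=10)
for m in resp.json()["data"]:
    print(m["id"])  # should print /home/hmp/llm/cache/Qwen1___5-32B-Chat
```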

## 2. Test vLLM

```
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "/home/hmp/llm/cache/Qwen1___5-32B-Chat",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "How do I implement a decentralized controller?"}
        ]
    }'
```
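The same request can also be made from Python with the official `openai` client, since vLLM speaks the OpenAI protocol. A minimal sketch, assuming `openai` (v1 or later) is installed; the dummy key is never checked by vLLM's default server:

```
from openai import OpenAI

# Point the client at the local vLLM server instead of api.openai.com.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="no-api-key")

response = client.chat.completions.create(
    model="/home/hmp/llm/cache/Qwen1___5-32B-Chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How do I implement a decentralized controller?"},
    ],
)
print(response.choices[0].message.content)
```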

## 3. Configure this project

```
API_KEY = "sk-123456789xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx123456789"
LLM_MODEL = "vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "http://localhost:8000/v1/chat/completions"}
```

```
"vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"

where

"vllm-" is the prefix (required)
"/home/hmp/llm/cache/Qwen1___5-32B-Chat" is the model name (required)
"(max_token=4096)" is extra configuration (optional)
```
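Note that the registration loop in the code below iterates over `AVAIL_LLM_MODELS` looking for the `vllm-` prefix, so depending on your setup the same string may also need to appear there. A sketch of the corresponding `config.py` lines (the surrounding entries are placeholders):

```
# config.py (sketch): make the vllm- entry visible to the registration loop.
AVAIL_LLM_MODELS = [
    "gpt-3.5-turbo",  # existing entries stay as they are
    "vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)",
]
```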

## 4. Launch!

```
python main.py
```
@@ -784,6 +784,29 @@ for model in [m for m in AVAIL_LLM_MODELS if m.startswith("one-api-")]:
            "token_cnt": get_token_num_gpt35,
        },
    })

# -=-=-=-=-=-=- vllm alignment support -=-=-=-=-=-=-
for model in [m for m in AVAIL_LLM_MODELS if m.startswith("vllm-")]:
    # This interface was designed to plug into vllm multi-model serving more flexibly.
    # Example: AVAIL_LLM_MODELS = ["vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=6666)"]
    # where
    #   "vllm-" is the prefix (required)
    #   "/home/hmp/llm/cache/Qwen1___5-32B-Chat" is the model name (required)
    #   "(max_token=6666)" is extra configuration (optional)
    try:
        _, max_token_tmp = read_one_api_model_name(model)
    except:
        print(f"The max_token setting of vllm model {model} is not an integer; please check the configuration file.")
        continue
    model_info.update({
        model: {
            "fn_with_ui": chatgpt_ui,
            "fn_without_ui": chatgpt_noui,
            "can_multi_thread": True,
            "endpoint": openai_endpoint,
            "max_token": max_token_tmp,
            "tokenizer": tokenizer_gpt35,
            "token_cnt": get_token_num_gpt35,
        },
    })


# -=-=-=-=-=-=- azure model alignment support -=-=-=-=-=-=-
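The loop reuses `read_one_api_model_name` from the one-api path to split the configured string into a model name and its `max_token` value. The project's own implementation is not shown in this diff; a standalone sketch of the assumed behaviour, for illustration only:

```
import re

def parse_max_token_suffix(model: str, default: int = 4096):
    """Illustrative only: split 'name(max_token=N)' into ('name', N)."""
    match = re.search(r"\(max_token=(\d+)\)$", model)
    if match:
        return model[:match.start()], int(match.group(1))
    return model, default

print(parse_max_token_suffix("vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=6666)"))
# -> ('vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat', 6666)
```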
@@ -323,7 +323,10 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
    if not is_any_api_key(llm_kwargs['api_key']):
        raise AssertionError("You provided an invalid API_KEY.\n\n1. Temporary fix: type the api_key directly into the input box and press Enter to submit it.\n\n2. Permanent fix: configure it in config.py.")

    if llm_kwargs['llm_model'].startswith('vllm-'):
        api_key = 'no-api-key'
    else:
        api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])

    headers = {
        "Content-Type": "application/json",
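For `vllm-` models the placeholder key is sufficient because vLLM's default OpenAI-compatible server does not verify it; the value simply ends up in the request header. Roughly, assuming the usual Bearer scheme used for OpenAI-style endpoints (a sketch, not the exact project code):

```
# Sketch: what the Authorization header carries for a vllm- model.
api_key = 'no-api-key'
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}
print(headers["Authorization"])  # Bearer no-api-key
```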
@@ -365,7 +368,9 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
    if llm_kwargs['llm_model'].startswith('one-api-'):
        model = llm_kwargs['llm_model'][len('one-api-'):]
        model, _ = read_one_api_model_name(model)
    if llm_kwargs['llm_model'].startswith('vllm-'):
        model = llm_kwargs['llm_model'][len('vllm-'):]
        model, _ = read_one_api_model_name(model)
    if model == "gpt-3.5-random": # pick randomly to work around openai rate limits
        model = random.choice([
            "gpt-3.5-turbo",