access vllm
This commit is contained in:
parent 02b6f26b05
commit 2406022c2a

docs/use_vllm.md (new file, 46 lines)

@@ -0,0 +1,46 @@
# Using vLLM

## 1. First, start vLLM and choose a model yourself

```
python -m vllm.entrypoints.openai.api_server --model /home/hmp/llm/cache/Qwen1___5-32B-Chat --tensor-parallel-size 2 --dtype=half
```

This example uses a local model stored at `/home/hmp/llm/cache/Qwen1___5-32B-Chat`; change the path to whatever model you need.
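Once the server is up, you can confirm which model it is actually serving (and the exact model name string to use in the later steps) by listing its models. A minimal sketch, assuming the `requests` package is installed and the server listens on the default port 8000:

```
# Sanity check: ask the vLLM OpenAI-compatible server which models it serves.
import requests

resp = requests.get("http://localhost:8000/v1/models", timeout=10)
for m in resp.json()["data"]:
    print(m["id"])  # should print /home/hmp/llm/cache/Qwen1___5-32B-Chat
```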

## 2. Test vLLM

```
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "/home/hmp/llm/cache/Qwen1___5-32B-Chat",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "How do I implement a decentralized controller?"}
        ]
    }'
```
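The same request can also be made from Python with the official `openai` client, since vLLM speaks the OpenAI protocol. A minimal sketch, assuming `openai` (v1 or later) is installed; the dummy key is never checked by vLLM's default server:

```
from openai import OpenAI

# Point the client at the local vLLM server instead of api.openai.com.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="no-api-key")

response = client.chat.completions.create(
    model="/home/hmp/llm/cache/Qwen1___5-32B-Chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How do I implement a decentralized controller?"},
    ],
)
print(response.choices[0].message.content)
```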

## 3. Configure this project

```
API_KEY = "sk-123456789xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx123456789"
LLM_MODEL = "vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "http://localhost:8000/v1/chat/completions"}
```

```
"vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"

where

"vllm-" is the prefix (required)
"/home/hmp/llm/cache/Qwen1___5-32B-Chat" is the model name (required)
"(max_token=4096)" is extra configuration (optional)
```
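Note that the registration loop in the code below iterates over `AVAIL_LLM_MODELS` looking for the `vllm-` prefix, so depending on your setup the same string may also need to appear there. A sketch of the corresponding `config.py` lines (the surrounding entries are placeholders):

```
# config.py (sketch): make the vllm- entry visible to the registration loop.
AVAIL_LLM_MODELS = [
    "gpt-3.5-turbo",  # existing entries stay as they are
    "vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)",
]
```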

## 4. Launch!

```
python main.py
```
@@ -784,6 +784,29 @@ for model in [m for m in AVAIL_LLM_MODELS if m.startswith("one-api-")]:
            "token_cnt": get_token_num_gpt35,
        },
    })

# -=-=-=-=-=-=- vllm alignment support -=-=-=-=-=-=-
for model in [m for m in AVAIL_LLM_MODELS if m.startswith("vllm-")]:
    # This interface was designed to plug into vllm multi-model serving more flexibly.
    # Example: AVAIL_LLM_MODELS = ["vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=6666)"]
    # where
    #   "vllm-" is the prefix (required)
    #   "/home/hmp/llm/cache/Qwen1___5-32B-Chat" is the model name (required)
    #   "(max_token=6666)" is extra configuration (optional)
    try:
        _, max_token_tmp = read_one_api_model_name(model)
    except:
        print(f"The max_token setting of vllm model {model} is not an integer; please check the configuration file.")
        continue
    model_info.update({
        model: {
            "fn_with_ui": chatgpt_ui,
            "fn_without_ui": chatgpt_noui,
            "can_multi_thread": True,
            "endpoint": openai_endpoint,
            "max_token": max_token_tmp,
            "tokenizer": tokenizer_gpt35,
            "token_cnt": get_token_num_gpt35,
        },
    })


# -=-=-=-=-=-=- azure model alignment support -=-=-=-=-=-=-
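The loop reuses `read_one_api_model_name` from the one-api path to split the configured string into a model name and its `max_token` value. The project's own implementation is not shown in this diff; a standalone sketch of the assumed behaviour, for illustration only:

```
import re

def parse_max_token_suffix(model: str, default: int = 4096):
    """Illustrative only: split 'name(max_token=N)' into ('name', N)."""
    match = re.search(r"\(max_token=(\d+)\)$", model)
    if match:
        return model[:match.start()], int(match.group(1))
    return model, default

print(parse_max_token_suffix("vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=6666)"))
# -> ('vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat', 6666)
```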
@@ -323,7 +323,10 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
    if not is_any_api_key(llm_kwargs['api_key']):
        raise AssertionError("You provided an invalid API_KEY.\n\n1. Temporary fix: type the api_key directly into the input box and press Enter to submit it.\n\n2. Permanent fix: configure it in config.py.")

    if llm_kwargs['llm_model'].startswith('vllm-'):
        api_key = 'no-api-key'
    else:
        api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])

    headers = {
        "Content-Type": "application/json",
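For `vllm-` models the placeholder key is sufficient because vLLM's default OpenAI-compatible server does not verify it; the value simply ends up in the request header. Roughly, assuming the usual Bearer scheme used for OpenAI-style endpoints (a sketch, not the exact project code):

```
# Sketch: what the Authorization header carries for a vllm- model.
api_key = 'no-api-key'
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}
print(headers["Authorization"])  # Bearer no-api-key
```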
@@ -365,7 +368,9 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
    if llm_kwargs['llm_model'].startswith('one-api-'):
        model = llm_kwargs['llm_model'][len('one-api-'):]
        model, _ = read_one_api_model_name(model)
    if llm_kwargs['llm_model'].startswith('vllm-'):
        model = llm_kwargs['llm_model'][len('vllm-'):]
        model, _ = read_one_api_model_name(model)
    if model == "gpt-3.5-random": # pick randomly to work around openai rate limits
        model = random.choice([
            "gpt-3.5-turbo",