OpenDAS / dynamo · Commits · efe82b86

Commit efe82b86, authored Mar 09, 2025 by Alec, committed by GitHub on Mar 09, 2025
Parent: 5944dbed

feat: make vllm baseline support both chat and completions (#70)

Showing 3 changed files with 212 additions and 41 deletions (+212 / -41):

  examples/python_rs/llm/vllm/README.md             +2    -1
  examples/python_rs/llm/vllm/monolith/worker.py    +104  -40
  examples/python_rs/llm/vllm/scripts/baseline.sh   +106  -0
examples/python_rs/llm/vllm/README.md
@@ -63,7 +63,8 @@ By default the server will run on port 8080.
 Add model to the server:
 ```bash
-llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.generate
+llmctl http add chat deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.chat/completions
+llmctl http add completions deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.completions
 ```
 ##### Example Output
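
Once both routes are registered with llmctl, the HTTP frontend can be exercised with standard OpenAI-style requests. A hedged sketch follows; the /v1 paths and port 8080 are assumed from the surrounding README rather than verified against this commit:

```bash
# Chat endpoint (served by VllmEngine.generate_chat); path and port are assumptions.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
       "messages": [{"role": "user", "content": "Hello!"}]}'

# Completions endpoint (served by VllmEngine.generate_completions); same assumptions.
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
       "prompt": "Hello, my name is",
       "max_tokens": 32}'
```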
examples/python_rs/llm/vllm/monolith/worker.py
@@ -13,62 +13,119 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
 import asyncio
-import uuid
+import json
+from typing import AsyncGenerator, AsyncIterator
 
 import uvloop
-from common.base_engine import BaseVllmEngine
-from common.chat_processor import ProcessMixIn
 from common.parser import parse_vllm_args
+from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args,
+)
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
+    ChatCompletionResponse,
     ChatCompletionStreamResponse,
+    CompletionRequest,
+    CompletionResponse,
+    CompletionStreamResponse,
+    ErrorResponse,
 )
-from vllm.logger import logger as vllm_logger
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 
 from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
 
 
-class VllmEngine(BaseVllmEngine, ProcessMixIn):
-    """
-    Request handler for the generate endpoint
-    """
-
-    def __init__(self, engine_args: AsyncEngineArgs):
-        super().__init__(engine_args)
+class VllmEngine:
+    def __init__(
+        self,
+        engine_client: AsyncIterator[EngineClient],
+        model_config: ModelConfig,
+    ):
+        self.engine_client = engine_client
+        self.model_config = model_config
+        # Ensure served_model_name matches the openai model name
+        # Use --served-model-name to explicitly set this or it will fallback to --model
+        models = OpenAIServingModels(
+            engine_client=engine_client,
+            model_config=model_config,
+            base_model_paths=[
+                BaseModelPath(
+                    name=model_config.served_model_name,
+                    model_path=model_config.model,
+                )
+            ],
+        )
+        self.chat_serving = OpenAIServingChat(
+            engine_client=self.engine_client,
+            model_config=self.model_config,
+            models=models,
+            response_role="assistant",
+            request_logger=None,
+            chat_template=None,
+            chat_template_content_format="auto",
+        )
+        self.completion_serving = OpenAIServingCompletion(
+            engine_client=self.engine_client,
+            model_config=self.model_config,
+            models=models,
+            request_logger=None,
+        )
 
     @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
-    async def generate(self, raw_request):
-        if self.engine_client is None:
-            await self.initialize()
-
-        vllm_logger.debug(f"Got raw request: {raw_request}")
-        (
-            request,
-            conversation,
-            _,
-            engine_prompt,
-            sampling_params,
-        ) = await self._parse_raw_request(raw_request)
-
-        request_id = str(uuid.uuid4())
-        vllm_logger.debug(
-            f"Running generate with engine_prompt: {engine_prompt}, sampling_params: {sampling_params}, request_id: {request_id}"
-        )
-
-        if self.engine_client is None:
-            raise RuntimeError("Engine client not initialized")
-        else:
-            generator = self.engine_client.generate(
-                engine_prompt,
-                sampling_params,
-                request_id,
-            )
-
-        async for response in await self._stream_response(
-            request, generator, request_id, conversation
-        ):
-            vllm_logger.debug(f"Generated response: {response}")
-            yield response
+    async def generate_chat(self, request):
+        result = await self.chat_serving.create_chat_completion(request)
+        if isinstance(result, AsyncGenerator):
+            async for raw_response in result:
+                if raw_response.startswith("data: [DONE]"):
+                    break
+                response = json.loads(raw_response.lstrip("data: "))
+                yield response
+        # We should always be streaming so should never get here
+        elif isinstance(result, ChatCompletionResponse):
+            raise RuntimeError("ChatCompletionResponse support not implemented")
+        elif isinstance(result, ErrorResponse):
+            error = result.dict()
+            raise RuntimeError(
+                f"Error {error['code']}: {error['message']} "
+                f"(type: {error['type']}, param: {error['param']})"
+            )
+        else:
+            raise TypeError(f"Unexpected response type: {type(result)}")
+
+    @dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
+    async def generate_completions(self, request):
+        result = await self.completion_serving.create_completion(request)
+        if isinstance(result, AsyncGenerator):
+            async for raw_response in result:
+                if raw_response.startswith("data: [DONE]"):
+                    break
+                response = json.loads(raw_response.lstrip("data: "))
+                yield response
+        # We should always be streaming so should never get here
+        elif isinstance(result, CompletionResponse):
+            raise RuntimeError("CompletionResponse support not implemented")
+        elif isinstance(result, ErrorResponse):
+            error = result.dict()
+            raise RuntimeError(
+                f"Error {error['code']}: {error['message']} "
+                f"(type: {error['type']}, param: {error['param']})"
+            )
+        else:
+            raise TypeError(f"Unexpected response type: {type(result)}")
 
 
 @dynamo_worker()
@@ -80,10 +137,17 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     component = runtime.namespace("dynamo").component("vllm")
     await component.create_service()
 
-    endpoint = component.endpoint("generate")
+    chat_endpoint = component.endpoint("chat/completions")
+    completions_endpoint = component.endpoint("completions")
 
-    async with VllmEngine(engine_args) as engine:
-        await endpoint.serve_endpoint(engine.generate)
+    async with build_async_engine_client_from_engine_args(engine_args) as engine_client:
+        model_config = await engine_client.get_model_config()
+        engine = VllmEngine(engine_client, model_config)
+        await asyncio.gather(
+            chat_endpoint.serve_endpoint(engine.generate_chat),
+            completions_endpoint.serve_endpoint(engine.generate_completions),
+        )
 
 
 if __name__ == "__main__":
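
Both handlers consume the SSE-style output of vLLM's OpenAI serving layer: each event arrives as a "data: {json}" line, the stream ends with "data: [DONE]", and the worker strips the prefix and json.loads the payload before yielding it over the dynamo endpoint. (Note that str.lstrip removes a character set rather than a literal prefix, which is harmless here only because the JSON payload starts with "{".) A hedged way to see a stream of the same shape from the client side, assuming the frontend exposes the usual /v1 routes on port 8080 and supports streamed responses:

```bash
# Stream a chat completion and watch the raw "data: ..." events arrive;
# the route and port are assumptions from the README, not verified against this commit.
curl -N http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
       "messages": [{"role": "user", "content": "Hi"}],
       "stream": true}'
```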
examples/python_rs/llm/vllm/scripts/baseline.sh
new file (mode 0 → 100755)
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must use a single GPU for workers as CUDA_VISIBLE_DEVICES is set to a fixed value
# - Must use a single node

set -xe

if [ $# -lt 2 ]; then
    echo "Usage: $0 <number_of_workers> <routing_strategy> <log_dir_name> [model_name] [model_args] [chat_endpoint_name] [completions_endpoint_name]"
    echo "Error: Must specify at least number of workers and routing strategy and log_dir_name"
    echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
    echo "Optional: model_args (quoted string with model arguments)"
    echo "Optional: chat_endpoint_name (default: dynamo.vllm.chat/completions)"
    echo "Optional: completions_endpoint_name (default: dynamo.vllm.completions)"
    exit 1
fi

# If using Cache, can set this to not check HF
export HF_HUB_OFFLINE=1
export GLOO_SOCKET_IFNAME=lo

# Required for Qwen2.5 R1 Distilled in order to set --block-size 64 and --kv-cache-dtype fp8
uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.5/
export VLLM_ATTENTION_BACKEND=FLASHINFER

NUM_WORKERS=$1
ROUTING_STRATEGY=$2
LOG_DIR_NAME=$3
MODEL_NAME=${4:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
CUSTOM_MODEL_ARGS=$5
CHAT_ENDPOINT_NAME=${6:-"dynamo.vllm.chat/completions"}
COMPLETIONS_ENDPOINT_NAME=${7:-"dynamo.vllm.completions"}
VALID_STRATEGIES=("random")
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
INIT_CMD="source /opt/dynamo/venv/bin/activate && cd $WORKDIR"

# Default model args
DEFAULT_MODEL_ARGS="--model $MODEL_NAME \
    --tokenizer $MODEL_NAME \
    --enable-prefix-caching \
    --block-size 64"

# Use custom model args if provided, otherwise use default
if [ -n "$CUSTOM_MODEL_ARGS" ]; then
    MODEL_ARGS="$CUSTOM_MODEL_ARGS"
    echo "Using custom model arguments"
else
    MODEL_ARGS="$DEFAULT_MODEL_ARGS"
    echo "Using default model arguments"
fi

# Create logs directory if it doesn't exist
LOGS_DIR="/logs/$LOG_DIR_NAME"
mkdir -p $LOGS_DIR
chmod -R 775 $LOGS_DIR

if [[ ! "${VALID_STRATEGIES[@]}" =~ "${ROUTING_STRATEGY}" ]]; then
    echo "Error: Invalid routing strategy. Must be one of: ${VALID_STRATEGIES[*]}"
    exit 1
fi

########################################################
# HTTP Server
########################################################
HTTP_CMD="DYN_LOG=DEBUG http |& tee $LOGS_DIR/http.log"
tmux new-session -d -s "$SESSION_NAME-http"
tmux send-keys -t "$SESSION_NAME-http" "$INIT_CMD && $HTTP_CMD" C-m

########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && \
    llmctl http remove chat $MODEL_NAME && \
    llmctl http remove completions $MODEL_NAME && \
    llmctl http add chat $MODEL_NAME $CHAT_ENDPOINT_NAME && \
    llmctl http add completions $MODEL_NAME $COMPLETIONS_ENDPOINT_NAME && \
    llmctl http list |& tee $LOGS_DIR/llmctl.log"
tmux new-session -d -s "$SESSION_NAME-llmctl"
tmux send-keys -t "$SESSION_NAME-llmctl" "$INIT_CMD && $LLMCTL_CMD" C-m

########################################################
# Workers
########################################################
WORKER_CMD="RUST_LOG=info python3 -m monolith.worker $MODEL_ARGS"
for i in $(seq 1 $NUM_WORKERS); do
    tmux new-session -d -s "$SESSION_NAME-$i"
done

for i in $(seq 1 $NUM_WORKERS); do
    tmux send-keys -t "$SESSION_NAME-$i" "$INIT_CMD && CUDA_VISIBLE_DEVICES=$((i-1)) $WORKER_CMD |& tee $LOGS_DIR/worker-$i.log" C-m
done
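
For reference, a possible invocation of the script, assuming the container layout it expects (/workspace/examples/python_rs/llm/vllm, /opt/dynamo/venv, and a writable /logs); the log directory name below is made up for illustration:

```bash
# Two workers on GPUs 0 and 1, "random" routing (the only strategy the script accepts),
# logs collected under /logs/baseline-run (hypothetical name).
./examples/python_rs/llm/vllm/scripts/baseline.sh 2 random baseline-run

# Each component runs in its own tmux session; attach to inspect the first worker.
tmux attach -t v-1
```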