Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c53e0730
Unverified
Commit
c53e0730
authored
Apr 25, 2025
by
Rui Qiao
Committed by
GitHub
Apr 25, 2025
Browse files
[Misc] Refine ray_serve_deepseek example (#17204)
Signed-off-by:
Rui Qiao
<
ruisearch42@gmail.com
>
parent
a0e619e6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
25 deletions
+29
-25
examples/online_serving/ray_serve_deepseek.py
examples/online_serving/ray_serve_deepseek.py
+29
-25
No files found.
examples/online_serving/ray_serve_deepseek.py
View file @
c53e0730
...
@@ -8,37 +8,41 @@ Run `python3 ray_serve_deepseek.py` to deploy the model.
...
@@ -8,37 +8,41 @@ Run `python3 ray_serve_deepseek.py` to deploy the model.
"""
"""
from
ray
import
serve
from
ray
import
serve
from
ray.serve.llm
import
LLMConfig
,
LLMRouter
,
LLMServer
from
ray.serve.llm
import
LLMConfig
,
build_openai_app
llm_config
=
LLMConfig
(
llm_config
=
LLMConfig
(
model_loading_config
=
dict
(
model_loading_config
=
{
model_id
=
"deepseek"
,
"model_id"
:
"deepseek"
,
# Change to model download path
# Since DeepSeek model is huge, it is recommended to pre-download
model_source
=
"/path/to/the/model"
,
# the model to local disk, say /path/to/the/model and specify:
),
# model_source="/path/to/the/model"
deployment_config
=
dict
(
autoscaling_config
=
dict
(
"model_source"
:
"deepseek-ai/DeepSeek-R1"
,
min_replicas
=
1
,
},
max_replicas
=
1
,
deployment_config
=
{
)),
"autoscaling_config"
:
{
"min_replicas"
:
1
,
"max_replicas"
:
1
,
}
},
# Change to the accelerator type of the node
# Change to the accelerator type of the node
accelerator_type
=
"H100"
,
accelerator_type
=
"H100"
,
runtime_env
=
dict
(
env_vars
=
dict
(
VLLM_USE_V1
=
"1"
)),
runtime_env
=
{
"env_vars"
:
{
"VLLM_USE_V1"
:
"1"
}},
# Customize engine arguments as needed (e.g. vLLM engine kwargs)
# Customize engine arguments as needed (e.g. vLLM engine kwargs)
engine_kwargs
=
dict
(
engine_kwargs
=
{
tensor_parallel_size
=
8
,
"
tensor_parallel_size
"
:
8
,
pipeline_parallel_size
=
2
,
"
pipeline_parallel_size
"
:
2
,
gpu_memory_utilization
=
0.92
,
"
gpu_memory_utilization
"
:
0.92
,
dtype
=
"auto"
,
"
dtype
"
:
"auto"
,
max_num_seqs
=
40
,
"
max_num_seqs
"
:
40
,
max_model_len
=
16384
,
"
max_model_len
"
:
16384
,
enable_chunked_prefill
=
True
,
"
enable_chunked_prefill
"
:
True
,
enable_prefix_caching
=
True
,
"
enable_prefix_caching
"
:
True
,
trust_remote_code
=
True
,
"
trust_remote_code
"
:
True
,
)
,
}
,
)
)
# Deploy the application
# Deploy the application
deployment
=
LLMServer
.
as_deployment
(
llm_app
=
build_openai_app
({
"llm_configs"
:
[
llm_config
]})
llm_config
.
get_serve_options
(
name_prefix
=
"vLLM:"
)).
bind
(
llm_config
)
llm_app
=
LLMRouter
.
as_deployment
().
bind
([
deployment
])
serve
.
run
(
llm_app
)
serve
.
run
(
llm_app
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment