Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
dd238a26
Commit
dd238a26
authored
Mar 15, 2025
by
Biswa Panda
Committed by
GitHub
Mar 15, 2025
Browse files
feat: add routerless processor based monolith example (#180)
parent
a509b8f6
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
105 additions
and
32 deletions
+105
-32
deploy/dynamo/sdk/README.md
deploy/dynamo/sdk/README.md
+0
-0
deploy/examples/llm/README.md
deploy/examples/llm/README.md
+7
-1
deploy/examples/llm/components/frontend.py
deploy/examples/llm/components/frontend.py
+0
-1
deploy/examples/llm/components/processor.py
deploy/examples/llm/components/processor.py
+33
-22
deploy/examples/llm/components/worker.py
deploy/examples/llm/components/worker.py
+8
-8
deploy/examples/llm/configs/monolith/routerless_processor_deployment.yaml
...llm/configs/monolith/routerless_processor_deployment.yaml
+37
-0
deploy/examples/llm/monolith/routerless_processor_deployment.py
.../examples/llm/monolith/routerless_processor_deployment.py
+20
-0
No files found.
deploy/dynamo/sdk/README.md
0 → 100644
View file @
dd238a26
deploy/examples/llm/README.md
View file @
dd238a26
...
...
@@ -95,6 +95,12 @@ cd /workspace/deploy/examples/llm
dynamo serve monolith.routerless_deployment:Frontend -f ./configs/monolith/routerless_deployment.yaml
```
#### Routerless processor based monolith
```bash
dynamo serve monolith.routerless_processor_deployment:Frontend -f ./configs/monolith/routerless_processor_deployment.yaml
```
#### Router based disaggregated serving
```bash
cd /workspace/deploy/examples/llm
...
...
@@ -104,7 +110,7 @@ dynamo serve disaggregated.router_based_deployment:Frontend -f ./configs/disaggr
#### Routerless disaggregated serving
```bash
cd /workspace/deploy/examples/llm
dynamo serve disaggregated.routerless_deployment:Frontend -f ./configs/disaggregated/routerles_deployment.yaml
dynamo serve disaggregated.routerless_deployment:Frontend -f ./configs/disaggregated/routerless_deployment.yaml
```
### Client
...
...
deploy/examples/llm/components/frontend.py
View file @
dd238a26
...
...
@@ -35,7 +35,6 @@ class FrontendConfig(BaseModel):
resources
=
{
"cpu"
:
"10"
,
"memory"
:
"20Gi"
},
workers
=
1
,
)
# todo this should be called ApiServer
class
Frontend
:
worker
=
depends
(
VllmWorker
)
...
...
deploy/examples/llm/components/processor.py
View file @
dd238a26
...
...
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
uuid
from
enum
import
Enum
from
typing
import
AsyncIterator
,
Tuple
,
Union
...
...
@@ -29,7 +30,7 @@ from vllm.logger import logger as vllm_logger
from
vllm.outputs
import
RequestOutput
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
dynamo.sdk
import
depends
,
dynamo_context
,
dynamo_endpoint
,
service
from
dynamo.sdk
import
async_on_start
,
depends
,
dynamo_context
,
dynamo_endpoint
,
service
class
RequestType
(
Enum
):
...
...
@@ -63,6 +64,7 @@ class Processor(ProcessMixIn):
self
.
tokenizer
,
self
.
model_config
)
self
.
router_mode
=
self
.
engine_args
.
router
self
.
min_workers
=
1
def
_create_tokenizer
(
self
,
engine_args
:
AsyncEngineArgs
)
->
AnyTokenizer
:
"""Create a TokenizerGroup using engine arguments similar to VLLM's approach"""
...
...
@@ -78,6 +80,25 @@ class Processor(ProcessMixIn):
)
return
base_tokenizer
@
async_on_start
async
def
async_init
(
self
):
runtime
=
dynamo_context
[
"runtime"
]
comp_ns
,
comp_name
=
VllmWorker
.
dynamo_address
()
# type: ignore
print
(
f
"[Processor] comp_ns:
{
comp_ns
}
, comp_name:
{
comp_name
}
"
)
self
.
worker_client
=
(
await
runtime
.
namespace
(
comp_ns
)
.
component
(
comp_name
)
.
endpoint
(
"generate"
)
.
client
()
)
while
len
(
self
.
worker_client
.
endpoint_ids
())
<
self
.
min_workers
:
print
(
f
"Waiting for workers to be ready.
\n
"
f
" Current:
{
len
(
self
.
worker_client
.
endpoint_ids
())
}
,"
f
" Required:
{
self
.
min_workers
}
"
)
await
asyncio
.
sleep
(
2
)
async
def
_generate
(
self
,
raw_request
:
Union
[
CompletionRequest
,
ChatCompletionRequest
],
...
...
@@ -92,15 +113,6 @@ class Processor(ProcessMixIn):
engine_prompt
,
sampling_params
,
)
=
await
self
.
_parse_raw_request
(
raw_request
)
runtime
=
dynamo_context
[
"runtime"
]
comp_ns
,
comp_name
=
VllmWorker
.
dynamo_address
()
# type: ignore
print
(
f
"[Processor] comp_ns:
{
comp_ns
}
, comp_name:
{
comp_name
}
"
)
worker_client
=
(
await
runtime
.
namespace
(
comp_ns
)
.
component
(
comp_name
)
.
endpoint
(
"generate"
)
.
client
()
)
if
self
.
router_mode
==
"kv"
:
async
for
route_response
in
self
.
router
.
generate
(
Tokens
(
tokens
=
engine_prompt
[
"prompt_token_ids"
]).
model_dump_json
()
...
...
@@ -113,7 +125,7 @@ class Processor(ProcessMixIn):
break
if
worker_id
==
""
:
engine_generator
=
await
worker_client
.
generate
(
engine_generator
=
await
self
.
worker_client
.
generate
(
vLLMGenerateRequest
(
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
...
...
@@ -122,7 +134,7 @@ class Processor(ProcessMixIn):
).
model_dump_json
()
)
else
:
engine_generator
=
await
worker_client
.
direct
(
engine_generator
=
await
self
.
worker_client
.
direct
(
vLLMGenerateRequest
(
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
...
...
@@ -132,22 +144,21 @@ class Processor(ProcessMixIn):
int
(
worker_id
),
)
elif
self
.
router_mode
==
"random"
:
engine_generator
=
await
worker_client
.
generate
(
engine_generator
=
await
self
.
worker_client
.
generate
(
vLLMGenerateRequest
(
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
request_id
=
request_id
,
).
model_dump_json
()
)
elif
self
.
router_mode
==
"round-robin"
:
engine_generator
=
await
self
.
worker_client
.
round_robin
(
vLLMGenerateRequest
(
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
request_id
=
request_id
,
).
model_dump_json
()
)
# TODO: add round-robin mode
# elif self.router_mode == "round-robin":
# engine_generator = await self.worker.round_robin(
# vLLMGenerateRequest(
# engine_prompt=engine_prompt,
# sampling_params=sampling_params,
# request_id=request_id,
# ).model_dump_json()
# )
output
=
self
.
_generate_responses
(
engine_generator
,
request_type
)
...
...
deploy/examples/llm/components/worker.py
View file @
dd238a26
...
...
@@ -90,14 +90,14 @@ class VllmWorker:
os
.
environ
[
"VLLM_KV_NAMESPACE"
]
=
"dynamo-init"
os
.
environ
[
"VLLM_KV_COMPONENT"
]
=
class_name
vllm_logger
.
info
(
f
"Generate endpoint ID:
{
VLLM_WORKER_ID
}
"
)
# note: worker_index is 1-based, but CUDA_VISIBLE_DEVICES is 0-based
gpu_idx
=
(
self
.
engine_args
.
cuda_visible_device_offset
+
server_context
.
worker_index
-
1
)
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
f
"
{
gpu_idx
}
"
self
.
metrics_publisher
=
KvMetricsPublisher
()
# note: worker_index is 1-based, but CUDA_VISIBLE_DEVICES is 0-based
gpu_idx
=
(
self
.
engine_args
.
cuda_visible_device_offset
+
server_context
.
worker_index
-
1
)
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
f
"
{
gpu_idx
}
"
self
.
metrics_publisher
=
KvMetricsPublisher
()
@
async_on_start
async
def
async_init
(
self
):
...
...
deploy/examples/llm/configs/monolith/routerless_processor_deployment.yaml
0 → 100644
View file @
dd238a26
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Frontend
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo-init.Processor.chat/completions
port
:
8000
Processor
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
router
:
random
VllmWorker
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager
:
true
block-size
:
64
max-model-len
:
16384
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
router
:
random
tensor-parallel-size
:
1
ServiceArgs
:
workers
:
1
deploy/examples/llm/monolith/routerless_processor_deployment.py
0 → 100644
View file @
dd238a26
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
components.frontend
import
Frontend
from
components.processor
import
Processor
from
components.worker
import
VllmWorker
Frontend
.
link
(
Processor
).
link
(
VllmWorker
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment