refactor: vLLM to new Python UX (#1983)

Co-authored-by: Graham King <grahamk@nvidia.com>

refactor: vLLM to new Python UX (#1983)
Co-authored-by: Graham King <grahamk@nvidia.com>
f3e3d94a · Alec · GitHub · 9f2356cb · f3e3d94a · f3e3d94a
Unverified Commit f3e3d94a authored Jul 22, 2025 by Alec Committed by GitHub Jul 22, 2025
20 changed files
--- a/examples/vllm/README.md
+++ b/examples/vllm/README.md
 <!--
 SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: Apache-2.0
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
 -->
-# LLM Deployment Examples using vLLM
+# LLM Deployment using vLLM
-This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using vLLM. For Dynamo integration, we leverage vLLM's native KV cache events, NIXL based transfer mechanisms, and metric reporting to enable KV-aware routing and P/D disaggregation.
+This directory contains a Dynamo vLLM engine and reference implementations for deploying Large Language Models (LLMs) in various configurations using vLLM. For Dynamo integration, we leverage vLLM's native KV cache events, NIXL-based transfer mechanisms, and metric reporting to enable KV-aware routing and P/D disaggregation.
 ## Deployment Architectures
@@ -36,11 +24,11 @@ docker compose -f deploy/metrics/docker-compose.yml up -d
 ### Build and Run docker
 ```bash
-./container/build.sh
+./container/build.sh --framework VLLM
 ```
 ```bash
-./container/run.sh -it [--mount-workspace]
+./container/run.sh -it --framework VLLM [--mount-workspace]
 ```
 This includes the specific commit [vllm-project/vllm#19790](https://github.com/vllm-project/vllm/pull/19790) which enables support for external control of the DP ranks.
@@ -74,7 +62,7 @@ Note: The above architecture illustrates all the components. The final component
 ```bash
 # requires one gpu
-cd examples/vllm
+cd components/backends/vllm
 bash launch/agg.sh
 ```
@@ -82,7 +70,7 @@ bash launch/agg.sh
 ```bash
 # requires two gpus
-cd examples/vllm
+cd components/backends/vllm
 bash launch/agg_router.sh
 ```
@@ -90,7 +78,7 @@ bash launch/agg_router.sh
 ```bash
 # requires two gpus
-cd examples/vllm
+cd components/backends/vllm
 bash launch/disagg.sh
 ```
@@ -98,7 +86,7 @@ bash launch/disagg.sh
 ```bash
 # requires three gpus
-cd examples/vllm
+cd components/backends/vllm
 bash launch/disagg_router.sh
 ```
@@ -108,7 +96,7 @@ This example is not meant to be performant but showcases dynamo routing to data
 ```bash
 # requires four gpus
-cd examples/vllm
+cd components/backends/vllm
 bash launch/dep.sh
 ```
@@ -146,7 +134,7 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
 Example with disagg:
 ```bash
-cd ~/dynamo/examples/vllm/deploy
+cd ~/dynamo/components/backends/vllm/deploy
 kubectl apply -f disagg.yaml
 ```

--- a/examples/vllm/deepseek-r1.md
+++ b/examples/vllm/deepseek-r1.md
 <!--
 SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: Apache-2.0
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
 -->
 # Running Deepseek R1 with Wide EP
@@ -51,4 +39,4 @@ curl localhost:8080/v1/chat/completions \
    "stream": false,
    "max_tokens": 30
  }'
 ```
\ No newline at end of file
--- a/examples/vllm/deploy/agg.yaml
+++ b/examples/vllm/deploy/agg.yaml
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -50,7 +39,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - dynamo
            - run
@@ -94,6 +83,6 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
-            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
+            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
--- a/examples/vllm/deploy/agg_router.yaml
+++ b/examples/vllm/deploy/agg_router.yaml
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -50,7 +39,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - dynamo
            - run
@@ -96,6 +85,6 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
--- a/examples/vllm/deploy/disagg.yaml
+++ b/examples/vllm/deploy/disagg.yaml
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -50,7 +39,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - dynamo
            - run
@@ -94,7 +83,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
    VllmPrefillWorker:
@@ -133,6 +122,6 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
--- a/examples/vllm/deploy/disagg_planner.yaml
+++ b/examples/vllm/deploy/disagg_planner.yaml
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -50,7 +39,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - dynamo
            - run
@@ -94,7 +83,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
    VllmPrefillWorker:
@@ -133,6 +122,6 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
--- a/examples/vllm/deploy/disagg_router.yaml
+++ b/examples/vllm/deploy/disagg_router.yaml
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -50,16 +39,9 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
-            - dynamo
+            - "python3 -m dynamo.frontend --http-port 8080 --router-mode kv"
-            - run
-            - in=http
-            - out=dyn
-            - --http-port
-            - "8000"
-            - --router-mode
-            - kv
    VllmDecodeWorker:
      dynamoNamespace: vllm-v1-disagg-router
      envFromSecret: hf-token-secret
@@ -96,9 +78,9 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
-            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
+            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
    VllmPrefillWorker:
      dynamoNamespace: vllm-v1-disagg-router
      envFromSecret: hf-token-secret
@@ -135,6 +117,6 @@ spec:
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm
+          workingDir: /workspace/components/backends/vllm
          args:
-            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
+            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
--- a/examples/vllm/launch/agg.sh
+++ b/examples/vllm/launch/agg.sh
@@ -5,7 +5,7 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 # run ingress
-dynamo run in=http out=dyn &
+python -m dynamo.frontend &
 # run worker
-python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching
+python -m dynamo.vllm  --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching
--- a/examples/vllm/launch/agg_router.sh
+++ b/examples/vllm/launch/agg_router.sh
@@ -5,9 +5,9 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 # run ingress
-dynamo run in=http out=dyn --router-mode kv &
+python -m dynamo.frontend --router-mode kv &
 # run workers
-CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
-CUDA_VISIBLE_DEVICES=1 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager
+CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager
--- a/examples/vllm/launch/dep.sh
+++ b/examples/vllm/launch/dep.sh
@@ -5,13 +5,13 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 # run ingress
-dynamo run in=http out=dyn --router-mode kv &
+python -m dynamo.frontend --router-mode kv &
 # Data Parallel Attention / Expert Parallelism
 # Routing to DP workers managed by Dynamo
 # Chose Qwen3-30B because it's a small MoE that can fit on smaller GPUs (L40S for example)
 for i in {0..3}; do
-    CUDA_VISIBLE_DEVICES=$i python3 components/main.py \
+    CUDA_VISIBLE_DEVICES=$i python3 -m dynamo.vllm \
    --model Qwen/Qwen3-30B-A3B \
    --data-parallel-rank $i \
    --data-parallel-size 4 \

--- a/examples/vllm/launch/disagg.sh
+++ b/examples/vllm/launch/disagg.sh
@@ -5,11 +5,11 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 # run ingress
-dynamo run in=http out=dyn &
+python -m dynamo.frontend --router-mode kv &
-CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
-CUDA_VISIBLE_DEVICES=1 python3 components/main.py \
+CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --model Qwen/Qwen3-0.6B \
    --enforce-eager \
    --is-prefill-worker
--- a/examples/vllm/launch/disagg_router.sh
+++ b/examples/vllm/launch/disagg_router.sh
@@ -6,14 +6,14 @@ set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 # run ingress
-dynamo run in=http out=dyn --router-mode kv &
+python -m dynamo.frontend --router-mode kv &
 # routing will happen between the two decode workers
-CUDA_VISIBLE_DEVICES=0 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
-CUDA_VISIBLE_DEVICES=1 python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager &
+CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
-CUDA_VISIBLE_DEVICES=2 python3 components/main.py \
+CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
    --model Qwen/Qwen3-0.6B \
    --enforce-eager \
    --is-prefill-worker
--- a/examples/vllm/launch/dsr1_dep.sh
+++ b/examples/vllm/launch/dsr1_dep.sh
@@ -76,7 +76,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
 # run ingress if it's node 0
 if [ $NODE_RANK -eq 0 ]; then
-    DYN_LOG=debug dynamo-run in=http out=dyn --router-mode kv 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
+    DYN_LOG=debug python -m dynamo.frontend --router-mode kv 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
 fi
 mkdir -p $LOG_DIR
@@ -89,7 +89,7 @@ for ((i=0; i<GPUS_PER_NODE; i++)); do
        VLLM_ALL2ALL_BACKEND="deepep_low_latency" \
        VLLM_USE_DEEP_GEMM=1 \
        VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1 \
-        python3 components/main.py \
+        python3 -m dynamo.vllm \
        --model deepseek-ai/DeepSeek-R1 \
        --data_parallel_size $DATA_PARALLEL_SIZE \
        --data-parallel-rank $dp_rank \

--- a/examples/vllm/multi-node.md
+++ b/examples/vllm/multi-node.md
 <!--
 SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: Apache-2.0
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
 -->
 # Multi-node Examples
@@ -63,10 +51,10 @@ Deploy vLLM workers across multiple nodes for horizontal scaling:
 **Node 1 (Head Node)**: Run ingress and first worker
 ```bash
 # Start ingress
-dynamo run in=http out=dyn
+python -m dynamo.frontend --router-mode kv
 # Start vLLM worker
-python3 components/main.py \
+python -m dynamo.vllm \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --tensor-parallel-size 8 \
  --enforce-eager
@@ -75,7 +63,7 @@ python3 components/main.py \
 **Node 2**: Run additional worker
 ```bash
 # Start vLLM worker
-python3 components/main.py \
+python -m dynamo.vllm \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --tensor-parallel-size 8 \
  --enforce-eager
@@ -88,10 +76,10 @@ Deploy prefill and decode workers on separate nodes for optimized resource utili
 **Node 1**: Run ingress and prefill workers
 ```bash
 # Start ingress
-dynamo run in=http out=dyn &
+python -m dynamo.frontend --router-mode kv &
 # Start prefill worker
-python3 components/main.py \
+python -m dynamo.vllm \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --tensor-parallel-size 8 \
  --enforce-eager
@@ -100,7 +88,7 @@ python3 components/main.py \
 **Node 2**: Run decode workers
 ```bash
 # Start decode worker
-python3 components/main.py \
+python -m dynamo.vllm \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --tensor-parallel-size 8 \
  --enforce-eager \
@@ -117,6 +105,6 @@ For models requiring more GPUs than available on a single node such as tensor-pa
 **Node 1**: First part of tensor-parallel model
 ```bash
 # Start ingress
-dynamo run in=http out=dyn &
+python -m dynamo.frontend --router-mode kv &
 ```
--- a/components/backends/vllm/requirements.txt
+++ b/components/backends/vllm/requirements.txt
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+uvloop
+vllm==0.9.2
--- a/components/backends/vllm/src/dynamo/vllm/__init__.py
+++ b/components/backends/vllm/src/dynamo/vllm/__init__.py
--- a/components/backends/vllm/src/dynamo/vllm/__main__.py
+++ b/components/backends/vllm/src/dynamo/vllm/__main__.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from dynamo.vllm.main import main
+if __name__ == "__main__":
+    main()
--- a/examples/vllm/components/args.py
+++ b/examples/vllm/components/args.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import asyncio
 import json

--- a/examples/vllm/components/handlers.py
+++ b/examples/vllm/components/handlers.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import asyncio
 import logging
@@ -21,12 +9,13 @@ from copy import deepcopy
 from typing import AsyncGenerator
 import msgspec
-from protocol import MyRequestOutput
 from vllm.inputs import TokensPrompt
 from vllm.sampling_params import SamplingParams
 from dynamo.runtime.logging import configure_dynamo_logging
+from .protocol import MyRequestOutput
 configure_dynamo_logging()
 logger = logging.getLogger(__name__)

--- a/examples/vllm/components/main.py
+++ b/examples/vllm/components/main.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import asyncio
 import logging
@@ -19,9 +7,6 @@ import os
 import signal
 import uvloop
-from args import Config, configure_ports_with_etcd, overwrite_args, parse_args
-from handlers import DecodeWorkerHandler, PrefillWorkerHandler
-from publisher import StatLoggerFactory
 from vllm.distributed.kv_events import ZmqEventPublisher
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.async_llm import AsyncLLM
@@ -35,6 +20,10 @@ from dynamo.llm import (
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
+from .args import Config, configure_ports_with_etcd, overwrite_args, parse_args
+from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
+from .publisher import StatLoggerFactory
 configure_dynamo_logging()
 logger = logging.getLogger(__name__)
@@ -211,6 +200,9 @@ async def init(runtime: DistributedRuntime, config: Config):
        handler.cleanup()
+def main():
+    uvloop.run(worker())
 if __name__ == "__main__":
-    uvloop.install()
+    main()
-    asyncio.run(worker())