refactor: remove old examples with old UX (#1899)

f00d700e · Alec · GitHub · c7080419 · c7080419 · c7080419
Unverified commit f00d700e, authored Jul 14, 2025 by Alec; committed by GitHub on Jul 14, 2025.
11 changed files
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: disagg
-spec:
-  services:
-    Frontend:
-      dynamoNamespace: vllm-v0-disagg
-      componentType: main
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "2Gi"
-        limits:
-          cpu: "1"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-            - -f
-            - ./configs/disagg.yaml
-    VllmWorker:
-      dynamoNamespace: vllm-v0-disagg
-      envFromSecret: hf-token-secret
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:VllmWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmWorker
-            - -f
-            - ./configs/disagg.yaml
-    PrefillWorker:
-      dynamoNamespace: vllm-v0-disagg
-      envFromSecret: hf-token-secret
-      replicas: 1
-      resources:
-        requests:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-        limits:
-          cpu: "10"
-          memory: "20Gi"
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:PrefillWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - PrefillWorker
-            - -f
-            - ./configs/disagg.yaml
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: disagg-planner
-spec:
-  services:
-    Frontend:
-      dynamoNamespace: vllm-v0-disagg-planner
-      componentType: main
-      replicas: 1
-      resources:
-        requests:
-          cpu: "2"
-          memory: "4Gi"
-        limits:
-          cpu: "2"
-          memory: "4Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_planner:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-
-    VllmWorker:
-      dynamoNamespace: vllm-v0-disagg-planner
-      envFromSecret: hf-token-secret
-      replicas: 1
-      resources:
-        requests:
-          cpu: "20"
-          memory: "40Gi"
-          gpu: "2"
-        limits:
-          cpu: "20"
-          memory: "40Gi"
-          gpu: "2"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_planner:VllmWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmWorker
-            - -f
-            - ./configs/disagg_planner.yaml
-    PrefillWorker:
-      dynamoNamespace: vllm-v0-disagg-planner
-      envFromSecret: hf-token-secret
-      replicas: 1
-      resources:
-        requests:
-          cpu: "20"
-          memory: "40Gi"
-          gpu: "2"
-        limits:
-          cpu: "20"
-          memory: "40Gi"
-          gpu: "2"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_planner:PrefillWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - PrefillWorker
-            - -f
-            - ./configs/disagg_planner.yaml
-    Planner:
-      dynamoNamespace: vllm-v0-disagg-planner
-      replicas: 1
-      componentType: planner
-      resources:
-        requests:
-          cpu: "2"
-          memory: "2Gi"
-        limits:
-          cpu: "2"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_planner:Planner
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Planner
-            - --Planner.environment=kubernetes
-            - -f
-            - ./configs/disagg_planner.yaml
-    Prometheus:
-      dynamoNamespace: vllm-v0-disagg-planner
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1000m"
-          memory: "1000Mi"
-        limits:
-          cpu: "1000m"
-          memory: "1000Mi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v0
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_planner:Prometheus
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Prometheus
-            - -f
-            - ./configs/disagg_planner.yaml
--- a/examples/vllm_v0/graphs/agg.py
+++ b/examples/vllm_v0/graphs/agg.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.worker import VllmWorker
-
-Frontend.link(VllmWorker)
--- a/examples/vllm_v0/graphs/disagg.py
+++ b/examples/vllm_v0/graphs/disagg.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.prefill_worker import PrefillWorker
-from components.worker import VllmWorker
-
-Frontend.link(VllmWorker).link(PrefillWorker)
--- a/examples/vllm_v0/graphs/disagg_planner.py
+++ b/examples/vllm_v0/graphs/disagg_planner.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from components.frontend import Frontend
-from components.prefill_worker import PrefillWorker
-from components.worker import VllmWorker
-
-from dynamo.planner.planner_sla import Planner
-from dynamo.planner.prometheus import Prometheus
-
-Frontend.link(VllmWorker).link(PrefillWorker)
-Frontend.link(Planner)
-Frontend.link(Prometheus)
--- a/examples/vllm_v0/utils/nats_queue.py
+++ b/examples/vllm_v0/utils/nats_queue.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import asyncio
-from contextlib import asynccontextmanager
-from typing import ClassVar, Optional
-
-from dynamo._core import NatsQueue
-
-
-class NATSQueue:
-    _instance: ClassVar[Optional["NATSQueue"]] = None
-    _lock: ClassVar[asyncio.Lock] = asyncio.Lock()
-
-    def __init__(
-        self,
-        stream_name: str = "default",
-        nats_server: str = "nats://localhost:4222",
-        dequeue_timeout: float = 1,
-    ):
-        self.nats_q = NatsQueue(stream_name, nats_server, dequeue_timeout)
-
-    @classmethod
-    @asynccontextmanager
-    async def get_instance(
-        cls,
-        *,
-        stream_name: str = "default",
-        nats_server: str = "nats://localhost:4222",
-        dequeue_timeout: float = 1,
-    ):
-        """Get or create a singleton instance of NATSq"""
-        # TODO: check if this _lock is needed with GIL
-        async with cls._lock:
-            if cls._instance is None:
-                cls._instance = cls(
-                    stream_name=stream_name,
-                    nats_server=nats_server,
-                    dequeue_timeout=dequeue_timeout,
-                )
-                await cls._instance.connect()
-            try:
-                yield cls._instance
-            except Exception:
-                if cls._instance:
-                    await cls._instance.close()
-                cls._instance = None
-                raise
-
-    # TODO: check to see if this can be replaced by something like get_instance().close()
-    @classmethod
-    async def shutdown(cls):
-        """Explicitly close the singleton instance if it exists"""
-        async with cls._lock:
-            if cls._instance:
-                await cls._instance.close()
-                cls._instance = None
-
-    async def connect(self):
-        await self.nats_q.connect()
-
-    async def ensure_connection(self):
-        await self.nats_q.ensure_connection()
-
-    async def close(self):
-        await self.nats_q.close()
-
-    # TODO: is enqueue/dequeue_object a better name for a general queue?
-    async def enqueue_task(self, task_data: bytes) -> None:
-        await self.nats_q.enqueue_task(task_data)
-
-    async def dequeue_task(self, timeout: Optional[float] = None) -> Optional[bytes]:
-        return await self.nats_q.dequeue_task(timeout)
-
-    async def get_queue_size(self) -> int:
-        return await self.nats_q.get_queue_size()
-
-    async def clear_queue(self) -> int:
-        try:
-            cleared_count = 0
-            # Continue until we can't dequeue any more messages
-            while True:
-                # use a small timeout
-                message = await self.dequeue_task(timeout=0.1)
-                if message is None:
-                    break
-                cleared_count += 1
-            return cleared_count
-        except Exception as e:
-            raise RuntimeError(f"Failed to clear queue: {e}")
--- a/examples/vllm_v0/utils/nixl.py
+++ b/examples/vllm_v0/utils/nixl.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-from contextlib import contextmanager
-
-import msgspec
-from vllm.distributed.device_communicators.nixl import NixlMetadata
-
-from dynamo.runtime import DistributedRuntime
-
-METADATA_DIR = "/tmp/nixl"
-
-logger = logging.getLogger(__name__)
-
-
-@contextmanager
-def temp_metadata_file(engine_id, metadata: NixlMetadata):
-    os.makedirs(METADATA_DIR, exist_ok=True)
-    path = f"{METADATA_DIR}/{engine_id}.nixl_meta"
-    with open(path, "wb") as f:
-        encoded = msgspec.msgpack.encode(metadata)
-        logger.info(f"Size of encoded metadata: {len(encoded)}")
-        f.write(encoded)
-    try:
-        yield path
-    finally:
-        if os.path.exists(path):
-            os.remove(path)
-
-
-def find_remote_metadata(engine_id):
-    # find and load metadata from METADATA_DIR that do not match engine_id
-    remote_metadata = []
-    for file in os.listdir(METADATA_DIR):
-        if file.endswith(".nixl_meta"):
-            if file.split(".")[0] != engine_id:
-                with open(os.path.join(METADATA_DIR, file), "rb") as f:
-                    remote_metadata.append(
-                        msgspec.msgpack.decode(f.read(), type=NixlMetadata)
-                    )
-    return remote_metadata
-
-
-class NixlMetadataStore:
-    NIXL_METADATA_KEY = "nixl_metadata"
-
-    def __init__(self, namespace: str, runtime: DistributedRuntime) -> None:
-        self._namespace = namespace
-
-        # TODO Remove metadata from etcd on delete
-        self._stored: set[str] = set()
-
-        self._cached: dict[str, NixlMetadata] = {}
-        self._client = runtime.etcd_client()
-        if self._client is None:
-            raise Exception("Cannot be used with static workers")
-        self._key_prefix = f"{self._namespace}/{NixlMetadataStore.NIXL_METADATA_KEY}"
-
-    async def put(self, engine_id, metadata: NixlMetadata):
-        serialized_metadata = msgspec.msgpack.encode(metadata)
-        key = "/".join([self._key_prefix, engine_id])
-        # create with primary lease so that the kv entry will be deleted when the worker shutdowns
-        try:
-            # TODO: should we create a series of function in etcd client to use primary lease?
-            await self._client.kv_create_or_validate(
-                key, serialized_metadata, self._client.primary_lease_id()
-            )
-        except Exception as e:
-            logger.warning(f"A different metadata exists for engine {engine_id}: {e}")
-        self._stored.add(engine_id)
-
-    async def get(self, engine_id) -> NixlMetadata:
-        try:
-            if engine_id in self._cached:
-                return self._cached[engine_id]
-
-            key = "/".join([self._key_prefix, engine_id])
-            key_values = await self._client.kv_get_prefix(key)
-            deserialized_metadata = None
-
-            for item in key_values:
-                deserialized_metadata = msgspec.msgpack.decode(
-                    item["value"], type=NixlMetadata
-                )
-                break
-
-            if deserialized_metadata is None:
-                raise Exception("metadata not found in etcd")
-
-            self._cached[engine_id] = deserialized_metadata
-
-            # TODO watch for changes and update cache
-
-            # self._client.add_watch_callback(
-            #     key,
-            #     self._watch_callback,
-            # )
-
-        except Exception as e:
-            raise Exception("Error retrieving metadata for engine {engine_id}") from e
-
-        return deserialized_metadata
--- a/examples/vllm_v0/utils/prefill_queue.py
+++ b/examples/vllm_v0/utils/prefill_queue.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from typing import Optional
-
-import msgspec
-from utils.nats_queue import NATSQueue
-from vllm.remote_prefill import RemotePrefillRequest
-
-
-class PrefillQueue(NATSQueue):
-    """
-    A wrapper of NATSQueue for PrefillRequest.
-    The stream name is forced to be "prefill_queue".
-    """
-
-    def __init__(
-        self,
-        stream_name="prefill_queue",
-        nats_server: str = "nats://localhost:4222",
-        dequeue_timeout: float = 1,
-    ):
-        super().__init__(
-            stream_name=stream_name,
-            nats_server=nats_server,
-            dequeue_timeout=dequeue_timeout,
-        )
-
-    async def enqueue_prefill_request(
-        self, prefill_request: RemotePrefillRequest
-    ) -> None:
-        encoded_request = msgspec.json.encode(prefill_request)
-        await self.enqueue_task(encoded_request)
-
-    async def dequeue_prefill_request(self) -> Optional[RemotePrefillRequest]:
-        encoded_request = await self.dequeue_task()
-        if encoded_request is not None:
-            prefill_request = msgspec.json.decode(
-                encoded_request, type=RemotePrefillRequest
-            )
-            return prefill_request
-        else:
-            return None
--- a/examples/vllm_v0/utils/protocol.py
+++ b/examples/vllm_v0/utils/protocol.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Optional
-
-from pydantic import BaseModel, Field
-
-TokenIdType = int
-
-
-# TODO: move these to common for all LLMs once we adopt dynamo-run
-# derived from lib/llm/src/protocols/common/preprocessor.rs
-class StopConditions(BaseModel):
-    max_tokens: Optional[int] = None
-    stop: Optional[List[str]] = None
-    stop_token_ids_hidden: Optional[List[TokenIdType]] = None
-    min_tokens: Optional[int] = None
-    ignore_eos: Optional[bool] = None
-
-
-class SamplingOptions(BaseModel):
-    n: Optional[int] = None
-    best_of: Optional[int] = None
-    presence_penalty: Optional[float] = None
-    frequency_penalty: Optional[float] = None
-    repetition_penalty: Optional[float] = None
-    temperature: Optional[float] = None
-    top_p: Optional[float] = None
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
-    use_beam_search: Optional[bool] = None
-    length_penalty: Optional[float] = None
-    seed: Optional[int] = None
-
-
-class PreprocessedRequest(BaseModel):
-    token_ids: List[TokenIdType]
-    stop_conditions: StopConditions
-    sampling_options: SamplingOptions
-    eos_token_ids: List[TokenIdType] = Field(default_factory=list)
-    mdc_sum: Optional[str] = None
-    annotations: List[str] = Field(default_factory=list)
-    estimated_prefix_hit_num_blocks: Optional[int] = None
-
-
-class DisaggPreprocessedRequest(BaseModel):
-    request: PreprocessedRequest
-    sampling_params: dict
-    bootstrap_host: str
-    bootstrap_port: int
-    bootstrap_room: int
--- a/examples/vllm_v0/utils/vllm.py
+++ b/examples/vllm_v0/utils/vllm.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TODO: rename to avoid ambiguity with vllm package
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
-
-from dynamo.sdk.lib.config import ServiceConfig
-
-
-class RouterType:
-    RANDOM = "random"
-    ROUND_ROBIN = "round-robin"
-    KV = "kv"
-    KV_LOAD = "kv-load"
-
-
-def parse_vllm_args(service_name, prefix) -> AsyncEngineArgs:
-    config = ServiceConfig.get_instance()
-    vllm_args = config.as_args(service_name, prefix=prefix)
-    parser = FlexibleArgumentParser()
-    parser.add_argument(
-        "--router",
-        type=str,
-        choices=[
-            RouterType.RANDOM,
-            RouterType.ROUND_ROBIN,
-            RouterType.KV,
-            RouterType.KV_LOAD,
-        ],
-        default=RouterType.RANDOM,
-        help="Router type to use for scheduling requests to workers",
-    )
-    parser.add_argument(
-        "--router-num-threads",
-        type=int,
-        default=4,
-        help="Number of threads to use for the router to process the requests",
-    )
-    parser.add_argument(
-        "--remote-prefill", action="store_true", help="Enable remote prefill"
-    )
-    parser.add_argument(
-        "--conditional-disagg",
-        action="store_true",
-        help="Use disaggregated router to decide whether to prefill locally or remotely",
-    )
-    parser.add_argument(
-        "--max-local-prefill-length",
-        type=int,
-        default=1000,
-        help="Maximum length for local prefill. If remote prefill is enabled and the prefill length is greater than this value the request will be sent for remote prefill, otherwise prefill phase will run locally.",
-    )
-    parser.add_argument(
-        "--max-prefill-queue-size",
-        type=int,
-        default=3,
-        help="Maximum queue size for remote prefill. If the prefill queue size is greater than this value, prefill phase of the incoming request will be executed locally.",
-    )
-    parser = AsyncEngineArgs.add_cli_args(parser)
-    args = parser.parse_args(vllm_args)
-    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine_args.router = args.router
-    engine_args.router_num_threads = args.router_num_threads
-    engine_args.remote_prefill = args.remote_prefill
-    engine_args.conditional_disagg = args.conditional_disagg
-    engine_args.max_local_prefill_length = args.max_local_prefill_length
-    engine_args.max_prefill_queue_size = args.max_prefill_queue_size
-    return engine_args
--- a/tests/serve/test_dynamo_serve.py
+++ b/tests/serve/test_dynamo_serve.py
@@ -395,10 +395,6 @@ class DynamoServeProcess(ManagedProcess):

 @pytest.fixture(
    params=[
-        pytest.param("agg", marks=[pytest.mark.vllm, pytest.mark.gpu_1]),
-        pytest.param("agg_router", marks=[pytest.mark.vllm, pytest.mark.gpu_1]),
-        pytest.param("disagg", marks=[pytest.mark.vllm, pytest.mark.gpu_2]),
-        pytest.param("disagg_router", marks=[pytest.mark.vllm, pytest.mark.gpu_2]),
        pytest.param("multimodal_agg", marks=[pytest.mark.vllm, pytest.mark.gpu_2]),
        pytest.param("trtllm_agg", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_1]),
        pytest.param(