Unverified Commit f00d700e authored by Alec's avatar Alec Committed by GitHub
Browse files

refactor: remove old examples with old UX (#1899)

parent c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment "disagg": three services (Frontend, VllmWorker,
# PrefillWorker) that all share the dynamoNamespace "vllm-v0-disagg", the same
# runtime image, the same working directory and the same ./configs/disagg.yaml.
# NOTE(review): the nesting indentation of this manifest appears to have been
# lost in this listing; restore it before applying the manifest.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: disagg
spec:
services:
# Frontend: the only service with componentType "main"; CPU/memory only, no GPU.
Frontend:
dynamoNamespace: vllm-v0-disagg
componentType: main
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
# Container command: `dynamo serve graphs.disagg:Frontend` with the system app on port 5000.
args:
- dynamo
- serve
- graphs.disagg:Frontend
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Frontend
- -f
- ./configs/disagg.yaml
# VllmWorker: one GPU per replica; environment is populated from the secret "hf-token-secret".
VllmWorker:
dynamoNamespace: vllm-v0-disagg
envFromSecret: hf-token-secret
replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
args:
- dynamo
- serve
- graphs.disagg:VllmWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmWorker
- -f
- ./configs/disagg.yaml
# PrefillWorker: same resource shape and secret as VllmWorker, different service entry point.
PrefillWorker:
dynamoNamespace: vllm-v0-disagg
envFromSecret: hf-token-secret
replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
args:
- dynamo
- serve
- graphs.disagg:PrefillWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- PrefillWorker
- -f
- ./configs/disagg.yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment "disagg-planner": five services (Frontend, VllmWorker,
# PrefillWorker, Planner, Prometheus) sharing the dynamoNamespace
# "vllm-v0-disagg-planner", one runtime image and one working directory.
# NOTE(review): the nesting indentation of this manifest appears to have been
# lost in this listing; restore it before applying the manifest.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: disagg-planner
spec:
services:
# Frontend: componentType "main"; CPU/memory only, no GPU.
Frontend:
dynamoNamespace: vllm-v0-disagg-planner
componentType: main
replicas: 1
resources:
requests:
cpu: "2"
memory: "4Gi"
limits:
cpu: "2"
memory: "4Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
# NOTE(review): this is the only service in the manifest that does not pass
# `-f ./configs/disagg_planner.yaml` — confirm the omission is intentional.
args:
- dynamo
- serve
- graphs.disagg_planner:Frontend
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Frontend
# VllmWorker: two GPUs per replica; environment is populated from the secret "hf-token-secret".
VllmWorker:
dynamoNamespace: vllm-v0-disagg-planner
envFromSecret: hf-token-secret
replicas: 1
resources:
requests:
cpu: "20"
memory: "40Gi"
gpu: "2"
limits:
cpu: "20"
memory: "40Gi"
gpu: "2"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
args:
- dynamo
- serve
- graphs.disagg_planner:VllmWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmWorker
- -f
- ./configs/disagg_planner.yaml
# PrefillWorker: same resource shape and secret as VllmWorker, different service entry point.
PrefillWorker:
dynamoNamespace: vllm-v0-disagg-planner
envFromSecret: hf-token-secret
replicas: 1
resources:
requests:
cpu: "20"
memory: "40Gi"
gpu: "2"
limits:
cpu: "20"
memory: "40Gi"
gpu: "2"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
args:
- dynamo
- serve
- graphs.disagg_planner:PrefillWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- PrefillWorker
- -f
- ./configs/disagg_planner.yaml
# Planner: componentType "planner"; started with the extra flag --Planner.environment=kubernetes.
Planner:
dynamoNamespace: vllm-v0-disagg-planner
replicas: 1
componentType: planner
resources:
requests:
cpu: "2"
memory: "2Gi"
limits:
cpu: "2"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
args:
- dynamo
- serve
- graphs.disagg_planner:Planner
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Planner
- --Planner.environment=kubernetes
- -f
- ./configs/disagg_planner.yaml
# Prometheus: served from the same runtime image via `dynamo serve graphs.disagg_planner:Prometheus`.
Prometheus:
dynamoNamespace: vllm-v0-disagg-planner
replicas: 1
resources:
requests:
cpu: "1000m"
memory: "1000Mi"
limits:
cpu: "1000m"
memory: "1000Mi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v0
args:
- dynamo
- serve
- graphs.disagg_planner:Prometheus
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Prometheus
- -f
- ./configs/disagg_planner.yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.worker import VllmWorker
# Graph wiring, executed at import time: link Frontend to VllmWorker.
# presumably `link` declares VllmWorker as a downstream dependency — confirm against the dynamo SDK.
Frontend.link(VllmWorker)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.prefill_worker import PrefillWorker
from components.worker import VllmWorker
# Graph wiring, executed at import time: Frontend is linked to VllmWorker, and
# the value returned by that call is linked to PrefillWorker.
# presumably this forms the chain Frontend -> VllmWorker -> PrefillWorker — confirm what `link` returns.
Frontend.link(VllmWorker).link(PrefillWorker)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.prefill_worker import PrefillWorker
from components.worker import VllmWorker
from dynamo.planner.planner_sla import Planner
from dynamo.planner.prometheus import Prometheus
# Graph wiring, executed at import time.
# Serving path: Frontend linked to VllmWorker, the result linked to PrefillWorker.
Frontend.link(VllmWorker).link(PrefillWorker)
# Planner and Prometheus are each linked directly to Frontend.
# presumably so they are deployed as part of the same graph — confirm against the dynamo SDK.
Frontend.link(Planner)
Frontend.link(Prometheus)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
from contextlib import asynccontextmanager
from typing import ClassVar, Optional
from dynamo._core import NatsQueue
class NATSQueue:
    """Async facade over ``dynamo._core.NatsQueue`` with an optional shared instance.

    Instances can be created directly, or obtained through the
    :meth:`get_instance` async context manager, which lazily creates and
    connects one instance per class and hands the same object to every caller.
    """

    # Shared instance managed by get_instance()/shutdown(); None until created.
    _instance: ClassVar[Optional["NATSQueue"]] = None
    # Serializes creation and teardown of the shared instance between coroutines.
    _lock: ClassVar[asyncio.Lock] = asyncio.Lock()

    def __init__(
        self,
        stream_name: str = "default",
        nats_server: str = "nats://localhost:4222",
        dequeue_timeout: float = 1,
    ):
        """Create the underlying ``NatsQueue``; no connection is opened here."""
        self.nats_q = NatsQueue(stream_name, nats_server, dequeue_timeout)

    @classmethod
    @asynccontextmanager
    async def get_instance(
        cls,
        *,
        stream_name: str = "default",
        nats_server: str = "nats://localhost:4222",
        dequeue_timeout: float = 1,
    ):
        """Yield the shared queue instance, creating and connecting it on first use.

        The keyword arguments are only used when the instance is created; later
        calls receive the existing instance regardless of the arguments passed.

        If the body of the ``async with`` raises an ``Exception``, the shared
        instance is closed and discarded before the exception is re-raised.
        """
        # The lock is required even under the GIL: ``await connect()`` is a
        # suspension point, so without it two coroutines could both observe
        # ``_instance is None`` and create two connections.
        async with cls._lock:
            if cls._instance is None:
                instance = cls(
                    stream_name=stream_name,
                    nats_server=nats_server,
                    dequeue_timeout=dequeue_timeout,
                )
                await instance.connect()
                # Publish only after a successful connect, so a failed attempt
                # never leaves an unconnected instance for later callers.
                cls._instance = instance
        try:
            yield cls._instance
        except Exception:
            if cls._instance:
                await cls._instance.close()
                cls._instance = None
            raise

    # TODO: check to see if this can be replaced by something like get_instance().close()
    @classmethod
    async def shutdown(cls):
        """Explicitly close the singleton instance if it exists"""
        async with cls._lock:
            if cls._instance:
                await cls._instance.close()
                cls._instance = None

    async def connect(self):
        """Connect the underlying queue."""
        await self.nats_q.connect()

    async def ensure_connection(self):
        """Delegate to the underlying queue's ``ensure_connection``."""
        await self.nats_q.ensure_connection()

    async def close(self):
        """Close the underlying queue."""
        await self.nats_q.close()

    # TODO: is enqueue/dequeue_object a better name for a general queue?
    async def enqueue_task(self, task_data: bytes) -> None:
        """Append ``task_data`` to the queue."""
        await self.nats_q.enqueue_task(task_data)

    async def dequeue_task(self, timeout: Optional[float] = None) -> Optional[bytes]:
        """Return the next task, or ``None`` when the underlying queue yields nothing."""
        return await self.nats_q.dequeue_task(timeout)

    async def get_queue_size(self) -> int:
        """Return the size reported by the underlying queue."""
        return await self.nats_q.get_queue_size()

    async def clear_queue(self) -> int:
        """Drain the queue and return how many messages were removed.

        Raises:
            RuntimeError: if dequeuing fails; the original error is chained.
        """
        try:
            cleared_count = 0
            # Keep dequeuing with a small timeout until nothing comes back.
            while await self.dequeue_task(timeout=0.1) is not None:
                cleared_count += 1
            return cleared_count
        except Exception as e:
            raise RuntimeError(f"Failed to clear queue: {e}") from e
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from contextlib import contextmanager
import msgspec
from vllm.distributed.device_communicators.nixl import NixlMetadata
from dynamo.runtime import DistributedRuntime
METADATA_DIR = "/tmp/nixl"
logger = logging.getLogger(__name__)
@contextmanager
def temp_metadata_file(engine_id, metadata: NixlMetadata):
    """Write ``metadata`` to ``METADATA_DIR/<engine_id>.nixl_meta`` for the duration of the block.

    The metadata is msgpack-encoded with msgspec. The context yields the file
    path and removes the file on exit, whether or not the body raised.

    Args:
        engine_id: identifier used as the file name stem.
        metadata: object to serialize into the file.

    Yields:
        The path of the metadata file.
    """
    # Encode before touching the filesystem so a serialization failure cannot
    # leave an empty file behind.
    encoded = msgspec.msgpack.encode(metadata)
    logger.info(f"Size of encoded metadata: {len(encoded)}")
    os.makedirs(METADATA_DIR, exist_ok=True)
    path = f"{METADATA_DIR}/{engine_id}.nixl_meta"
    try:
        # Writing inside the try ensures a partial file is cleaned up as well.
        with open(path, "wb") as f:
            f.write(encoded)
        yield path
    finally:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone (never created, or removed by someone else).
            pass
def find_remote_metadata(engine_id):
    """Load every ``*.nixl_meta`` file in ``METADATA_DIR`` except this engine's own.

    The local file is recognised by its full name ``<engine_id>.nixl_meta``,
    which is how ``temp_metadata_file`` names it, so engine ids that contain
    dots are handled correctly.

    Args:
        engine_id: identifier of the local engine, whose file is skipped.

    Returns:
        A list of decoded ``NixlMetadata`` objects; empty when the directory
        does not exist or holds no other metadata files.
    """
    suffix = ".nixl_meta"
    own_file = f"{engine_id}{suffix}"
    try:
        entries = os.listdir(METADATA_DIR)
    except FileNotFoundError:
        # Nobody has published metadata yet.
        return []

    remote_metadata = []
    for entry in entries:
        if not entry.endswith(suffix) or entry == own_file:
            continue
        with open(os.path.join(METADATA_DIR, entry), "rb") as f:
            remote_metadata.append(
                msgspec.msgpack.decode(f.read(), type=NixlMetadata)
            )
    return remote_metadata
class NixlMetadataStore:
    """Store and look up per-engine ``NixlMetadata`` through the runtime's etcd client.

    Entries live under ``<namespace>/nixl_metadata/<engine_id>`` and are
    msgpack-encoded with msgspec. Successful lookups are cached in memory.
    """

    NIXL_METADATA_KEY = "nixl_metadata"

    def __init__(self, namespace: str, runtime: DistributedRuntime) -> None:
        """Bind the store to ``namespace`` using ``runtime.etcd_client()``.

        Raises:
            Exception: if the runtime provides no etcd client.
        """
        self._namespace = namespace
        # TODO Remove metadata from etcd on delete
        self._stored: set[str] = set()
        self._cached: dict[str, NixlMetadata] = {}
        self._client = runtime.etcd_client()
        if self._client is None:
            raise Exception("Cannot be used with static workers")
        self._key_prefix = f"{self._namespace}/{NixlMetadataStore.NIXL_METADATA_KEY}"

    async def put(self, engine_id, metadata: NixlMetadata):
        """Publish ``metadata`` for ``engine_id`` under the client's primary lease.

        A failure of the create-or-validate call is logged as a warning and not
        raised; ``engine_id`` is recorded as stored in either case.
        """
        serialized_metadata = msgspec.msgpack.encode(metadata)
        key = "/".join([self._key_prefix, engine_id])
        # Create with the primary lease so that the kv entry is deleted when
        # the worker shuts down.
        try:
            # TODO: should we create a series of function in etcd client to use primary lease?
            await self._client.kv_create_or_validate(
                key, serialized_metadata, self._client.primary_lease_id()
            )
        except Exception as e:
            logger.warning(f"A different metadata exists for engine {engine_id}: {e}")
        self._stored.add(engine_id)

    async def get(self, engine_id) -> NixlMetadata:
        """Return the metadata for ``engine_id``, consulting the cache first.

        Raises:
            Exception: if the lookup fails or no entry exists; the message
                names the engine and the underlying error is chained.
        """
        try:
            if engine_id in self._cached:
                return self._cached[engine_id]
            key = "/".join([self._key_prefix, engine_id])
            key_values = await self._client.kv_get_prefix(key)
            deserialized_metadata = None
            # Only the first entry returned for the prefix is used.
            for item in key_values:
                deserialized_metadata = msgspec.msgpack.decode(
                    item["value"], type=NixlMetadata
                )
                break
            if deserialized_metadata is None:
                raise Exception("metadata not found in etcd")
            self._cached[engine_id] = deserialized_metadata
            # TODO watch for changes and update cache
            # self._client.add_watch_callback(
            #     key,
            #     self._watch_callback,
            # )
        except Exception as e:
            # The f-prefix is required so the engine id is actually interpolated.
            raise Exception(f"Error retrieving metadata for engine {engine_id}") from e
        return deserialized_metadata
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import msgspec
from utils.nats_queue import NATSQueue
from vllm.remote_prefill import RemotePrefillRequest
class PrefillQueue(NATSQueue):
    """
    NATSQueue specialised for RemotePrefillRequest payloads.
    Requests are JSON-encoded with msgspec on the way in and decoded on the way
    out. The stream name defaults to "prefill_queue" but can be overridden.
    """

    def __init__(
        self,
        stream_name="prefill_queue",
        nats_server: str = "nats://localhost:4222",
        dequeue_timeout: float = 1,
    ):
        """Forward all settings unchanged to the NATSQueue constructor."""
        queue_settings = {
            "stream_name": stream_name,
            "nats_server": nats_server,
            "dequeue_timeout": dequeue_timeout,
        }
        super().__init__(**queue_settings)

    async def enqueue_prefill_request(
        self, prefill_request: RemotePrefillRequest
    ) -> None:
        """JSON-encode ``prefill_request`` and enqueue the resulting bytes."""
        await self.enqueue_task(msgspec.json.encode(prefill_request))

    async def dequeue_prefill_request(self) -> Optional[RemotePrefillRequest]:
        """Dequeue and decode one request; return None when nothing was dequeued."""
        payload = await self.dequeue_task()
        if payload is None:
            return None
        return msgspec.json.decode(payload, type=RemotePrefillRequest)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
from pydantic import BaseModel, Field
TokenIdType = int
# TODO: move these to common for all LLMs once we adopt dynamo-run
# derived from lib/llm/src/protocols/common/preprocessor.rs
# Pydantic model grouping the stop-related settings of a request.
# Every field is optional and defaults to None.
# NOTE(review): field meanings are presumably those of the Rust protocol this
# was derived from (lib/llm/src/protocols/common/preprocessor.rs) — confirm there.
class StopConditions(BaseModel):
max_tokens: Optional[int] = None
stop: Optional[List[str]] = None
# List of token ids (TokenIdType is an alias of int).
stop_token_ids_hidden: Optional[List[TokenIdType]] = None
min_tokens: Optional[int] = None
ignore_eos: Optional[bool] = None
# Pydantic model grouping the sampling-related settings of a request.
# Every field is optional and defaults to None.
# NOTE(review): semantics and valid ranges of the individual fields are not
# visible here; presumably they follow the Rust preprocessor protocol — confirm.
class SamplingOptions(BaseModel):
n: Optional[int] = None
best_of: Optional[int] = None
presence_penalty: Optional[float] = None
frequency_penalty: Optional[float] = None
repetition_penalty: Optional[float] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
min_p: Optional[float] = None
use_beam_search: Optional[bool] = None
length_penalty: Optional[float] = None
seed: Optional[int] = None
# Pydantic model for a tokenized request: token ids plus the stop and sampling
# settings. token_ids, stop_conditions and sampling_options are required; the
# remaining fields have defaults.
class PreprocessedRequest(BaseModel):
token_ids: List[TokenIdType]
stop_conditions: StopConditions
sampling_options: SamplingOptions
# default_factory gives each instance its own empty list.
eos_token_ids: List[TokenIdType] = Field(default_factory=list)
mdc_sum: Optional[str] = None
annotations: List[str] = Field(default_factory=list)
estimated_prefix_hit_num_blocks: Optional[int] = None
# Pydantic model wrapping a PreprocessedRequest together with a free-form
# sampling_params dict and three bootstrap_* values. All fields are required.
# NOTE(review): presumably bootstrap_host/port/room identify the peer used in
# disaggregated serving — confirm against the code that builds this request.
class DisaggPreprocessedRequest(BaseModel):
request: PreprocessedRequest
sampling_params: dict
bootstrap_host: str
bootstrap_port: int
bootstrap_room: int
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: rename to avoid ambiguity with vllm package
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
from dynamo.sdk.lib.config import ServiceConfig
# String constants naming the available router types. They are the accepted
# values of the --router option defined in parse_vllm_args, where RANDOM is
# the default.
class RouterType:
RANDOM = "random"
ROUND_ROBIN = "round-robin"
KV = "kv"
KV_LOAD = "kv-load"
def parse_vllm_args(service_name, prefix) -> AsyncEngineArgs:
    """Build ``AsyncEngineArgs`` for a service from its ``ServiceConfig`` entries.

    The service's configuration is rendered as CLI arguments and parsed with a
    parser that accepts the standard vLLM engine options plus the
    router / remote-prefill options defined here. The extra option values are
    attached to the returned engine args as attributes of the same name.

    Args:
        service_name: service whose configuration section is read.
        prefix: prefix passed through to ``ServiceConfig.as_args``.

    Returns:
        The engine args, extended with ``router``, ``router_num_threads``,
        ``remote_prefill``, ``conditional_disagg``,
        ``max_local_prefill_length`` and ``max_prefill_queue_size``.
    """
    cli_args = ServiceConfig.get_instance().as_args(service_name, prefix=prefix)

    arg_parser = FlexibleArgumentParser()

    # Routing options.
    router_choices = [
        RouterType.RANDOM,
        RouterType.ROUND_ROBIN,
        RouterType.KV,
        RouterType.KV_LOAD,
    ]
    arg_parser.add_argument(
        "--router",
        type=str,
        choices=router_choices,
        default=RouterType.RANDOM,
        help="Router type to use for scheduling requests to workers",
    )
    arg_parser.add_argument(
        "--router-num-threads",
        type=int,
        default=4,
        help="Number of threads to use for the router to process the requests",
    )

    # Remote / disaggregated prefill options.
    arg_parser.add_argument(
        "--remote-prefill", action="store_true", help="Enable remote prefill"
    )
    arg_parser.add_argument(
        "--conditional-disagg",
        action="store_true",
        help="Use disaggregated router to decide whether to prefill locally or remotely",
    )
    arg_parser.add_argument(
        "--max-local-prefill-length",
        type=int,
        default=1000,
        help="Maximum length for local prefill. If remote prefill is enabled and the prefill length is greater than this value the request will be sent for remote prefill, otherwise prefill phase will run locally.",
    )
    arg_parser.add_argument(
        "--max-prefill-queue-size",
        type=int,
        default=3,
        help="Maximum queue size for remote prefill. If the prefill queue size is greater than this value, prefill phase of the incoming request will be executed locally.",
    )

    # Register the standard engine options after the custom ones, then parse.
    arg_parser = AsyncEngineArgs.add_cli_args(arg_parser)
    parsed = arg_parser.parse_args(cli_args)
    engine_args = AsyncEngineArgs.from_cli_args(parsed)

    # Copy the custom option values onto the engine args, in declaration order.
    extra_option_names = (
        "router",
        "router_num_threads",
        "remote_prefill",
        "conditional_disagg",
        "max_local_prefill_length",
        "max_prefill_queue_size",
    )
    for option_name in extra_option_names:
        setattr(engine_args, option_name, getattr(parsed, option_name))
    return engine_args
......@@ -395,10 +395,6 @@ class DynamoServeProcess(ManagedProcess):
@pytest.fixture(
params=[
pytest.param("agg", marks=[pytest.mark.vllm, pytest.mark.gpu_1]),
pytest.param("agg_router", marks=[pytest.mark.vllm, pytest.mark.gpu_1]),
pytest.param("disagg", marks=[pytest.mark.vllm, pytest.mark.gpu_2]),
pytest.param("disagg_router", marks=[pytest.mark.vllm, pytest.mark.gpu_2]),
pytest.param("multimodal_agg", marks=[pytest.mark.vllm, pytest.mark.gpu_2]),
pytest.param("trtllm_agg", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_1]),
pytest.param(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment