Unverified Commit 47477909 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: deprecate sdk as dependency (#2149)

parent 095ea3e7
# SPDX-FileCopyrightText: Copyright (c) 2020 Atalaya Tech. Inc
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modifications Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
from __future__ import annotations
import importlib
import logging
import os
import sys
from typing import Optional, TypeVar
import yaml
from dynamo.sdk.core.protocol.deployment import Service
from dynamo.sdk.core.protocol.interface import ServiceInterface
logger = logging.getLogger(__name__)
T = TypeVar("T", bound=object)
def find_and_load_service(
    import_str: str,
    working_dir: Optional[str] = None,
) -> ServiceInterface:
    """Load a DynamoService instance from source code by providing an import string.

    While the import runs, ``working_dir`` (when given) becomes the process
    working directory and is prepended to ``sys.path`` if it is not already
    there. Both changes are undone before returning, even when the import fails.

    Args:
        import_str: String in format "module[:attribute]" or "path/to/file.py[:attribute]"
            Examples:
                "graphs:disagg:Frontend"
                "fraud_detector:svc"
                "./path/to/service.py:MyService"
                "fraud_detector"  # Will find the root service if only one exists
        working_dir: Optional directory to use as base for imports. Defaults to cwd.

    Returns:
        The loaded DynamoService instance

    Raises:
        ImportError: If module cannot be imported
        ValueError: If service cannot be found or multiple root services exist
    """
    logger.debug(f"Loading service from import string: {import_str}")
    logger.debug(f"Working directory: {working_dir or os.getcwd()}")

    sys_path_modified = False
    prev_cwd = None

    if working_dir is not None:
        prev_cwd = os.getcwd()
        working_dir = os.path.realpath(os.path.expanduser(working_dir))
        logger.debug(f"Changing working directory to: {working_dir}")
        os.chdir(working_dir)
    else:
        working_dir = os.getcwd()

    try:
        if working_dir not in sys.path:
            logger.debug(f"Adding {working_dir} to sys.path")
            sys.path.insert(0, working_dir)
            sys_path_modified = True
        return _do_import(import_str, working_dir)
    finally:
        if sys_path_modified:
            logger.debug(f"Removing {working_dir} from sys.path")
            try:
                sys.path.remove(working_dir)
            except ValueError:
                # The imported code already removed the entry. Swallow this so
                # the original exception is not masked and the cwd restore
                # below still runs.
                logger.debug(f"{working_dir} was no longer present in sys.path")
        if prev_cwd is not None:
            logger.debug(f"Restoring working directory to: {prev_cwd}")
            os.chdir(prev_cwd)
def _do_import(import_str: str, working_dir: str) -> ServiceInterface:
    """Resolve ``import_str`` to a service object.

    The part before the first ``:`` is either a path to a ``.py`` file (turned
    into a dotted module name by walking up through directories that contain
    ``__init__.py``) or a module name. The part after ``:`` is a dot-separated
    attribute chain; when the current object is a ``ServiceInterface`` the next
    segment is looked up in its ``dependencies`` instead of via ``getattr``.
    Without an attribute part, the single service in the module that no other
    service depends on is returned.

    Args:
        import_str: ``"<module>"``, ``"<module>:<attrs>"`` or the same with a file path.
        working_dir: Directory that file-path imports must live under.

    Returns:
        The resolved object, tagged with ``_import_str`` if it had none.

    Raises:
        ImportError: If the file is outside ``working_dir``, is not a ``.py``
            file, or the module fails to import.
        ValueError: If the import string is empty, an attribute is missing, or
            there is not exactly one root service.
    """
    import_path, _, attrs_str = import_str.partition(":")
    logger.debug(f"Parsed import string - path: {import_path}, attributes: {attrs_str}")

    if not import_path:
        raise ValueError(
            f'Invalid import string "{import_str}", must be in format '
            '"<module>:<attribute>" or "<module>"'
        )

    # Handle file path vs module name imports
    if os.path.isfile(import_path):
        logger.debug(f"Importing from file path: {import_path}")
        import_path = os.path.realpath(import_path)
        # Compare against the directory WITH a trailing separator; a bare
        # prefix check would accept "/work/app2/x.py" for "/work/app".
        if not import_path.startswith(os.path.join(working_dir, "")):
            raise ImportError(
                f'Module "{import_path}" not found in working directory "{working_dir}"'
            )

        file_name, ext = os.path.splitext(import_path)
        if ext != ".py":
            raise ImportError(
                f'Invalid module extension "{ext}", only ".py" files are supported'
            )

        # Build module name from path components
        module_parts = []
        path = file_name
        while True:
            path, name = os.path.split(path)
            module_parts.append(name)
            # Stop at the first directory that is not a package, or at the
            # working directory itself.
            if (
                not os.path.exists(os.path.join(path, "__init__.py"))
                or path == working_dir
            ):
                break
        module_name = ".".join(reversed(module_parts))
        logger.debug(f"Constructed module name from path: {module_name}")
    else:
        logger.debug(f"Importing from module name: {import_path}")
        module_name = import_path

    try:
        logger.debug(f"Attempting to import module: {module_name}")
        module = importlib.import_module(module_name)
    except ImportError as e:
        # Chain the cause so the underlying traceback is preserved.
        raise ImportError(f'Failed to import module "{module_name}": {e}') from e

    # If no specific attribute given, find the root service
    if not attrs_str:
        logger.debug("No attributes specified, searching for root service")
        services = [
            (name, obj)
            for name, obj in module.__dict__.items()
            if isinstance(obj, ServiceInterface)
        ]
        logger.debug(f"Found {len(services)} DynamoService instances")

        if not services:
            raise ValueError(
                f"No DynamoService instances found in module '{module_name}'"
            )

        # Find root services (those that aren't dependencies of other services)
        depended_on = {
            dep.on
            for _, svc in services
            for dep in svc.dependencies.values()
            if dep.on is not None
        }
        root_services = [(n, s) for n, s in services if s not in depended_on]
        logger.debug(f"Found {len(root_services)} root services")

        if not root_services:
            raise ValueError(
                f"No root DynamoService found in module '{module_name}'. "
                "All services are dependencies of other services."
            )
        if len(root_services) > 1:
            names = [n for n, _ in root_services]
            raise ValueError(
                f"Multiple root services found in module '{module_name}': {names}. "
                "Please specify which service to use with '<module>:<service_name>'"
            )

        _, instance = root_services[0]
        logger.debug(f"Selected root service: {instance}")
    else:
        # Navigate through dot-separated attributes
        logger.debug(f"Navigating attributes: {attrs_str}")
        instance = module
        for attr in attrs_str.split("."):
            try:
                if isinstance(instance, ServiceInterface):
                    logger.debug(f"Following dependency link: {attr}")
                    instance = instance.dependencies[attr].on
                else:
                    logger.debug(f"Getting attribute: {attr}")
                    instance = getattr(instance, attr)
            except (AttributeError, KeyError) as e:
                raise ValueError(
                    f'Attribute "{attr}" not found in "{module_name}"'
                ) from e

    # Set import string for debugging/logging
    if not hasattr(instance, "_import_str"):
        import_str_val = f"{module_name}:{attrs_str}" if attrs_str else module_name
        logger.debug(f"Setting _import_str to: {import_str_val}")
        # object.__setattr__ bypasses any custom __setattr__ on the instance.
        object.__setattr__(instance, "_import_str", import_str_val)

    return instance
def _get_dir_size(path: str) -> int:
    """Return the combined size in bytes of every regular file below ``path``.

    Entries for which ``os.path.isfile`` is false are left out of the sum.
    """
    size = sum(
        os.path.getsize(candidate)
        for root, _dirs, entries in os.walk(path)
        for candidate in (os.path.join(root, entry) for entry in entries)
        if os.path.isfile(candidate)
    )
    logger.debug(f"Total size of {path}: {size} bytes")
    return size
def load_entry_service(
    graph_tag: str, build_dir: str = "~/.dynamo/packages"
) -> Service:
    """
    Given a built graph tag (e.g. frontend:2uk2fwzvqsswvs7t), load the entry service as a deployment Service instance.

    The graph is expected at ``<build_dir>/<name>/<version>`` and must contain a
    ``dynamo.yaml``. The graph's ``src`` directory is prepended to ``sys.path``
    (and left there) if it is not already present.

    Args:
        graph_tag: Tag in ``name:version`` form.
        build_dir: Root directory of built graphs; ``~`` is expanded.

    Returns:
        A ``Service`` describing the entry listed under ``entry_service``.

    Raises:
        ValueError: If the tag has no ``:``, the config is empty or not a
            mapping, or no service matches ``entry_service``.
        FileNotFoundError: If the graph directory or ``dynamo.yaml`` is missing.
    """
    if ":" not in graph_tag:
        raise ValueError("graph_tag must be in the form name:version")
    name, version = graph_tag.split(":", 1)

    graph_dir = os.path.expanduser(f"{build_dir}/{name}/{version}")
    if not os.path.isdir(graph_dir):
        raise FileNotFoundError(f"Graph directory not found: {graph_dir}")

    config_path = os.path.join(graph_dir, "dynamo.yaml")
    if not os.path.isfile(config_path):
        raise FileNotFoundError(f"Graph config (dynamo.yaml) not found in {graph_dir}")

    with open(config_path, encoding="utf-8") as f:
        graph_cfg = yaml.safe_load(f)
    # safe_load returns None for an empty document; fail with a clear message
    # instead of an AttributeError on the first .get() below.
    if not isinstance(graph_cfg, dict):
        raise ValueError(
            f"Graph config (dynamo.yaml) in {graph_dir} is empty or not a mapping"
        )

    # Add src_dir to sys.path if needed
    src_dir = os.path.join(graph_dir, "src")
    if src_dir not in sys.path:
        sys.path.insert(0, src_dir)

    # Compute size_bytes as the total size of the dynamo directory
    size_bytes = _get_dir_size(graph_dir)

    service_name = graph_cfg.get("service")
    entry_name = graph_cfg.get("entry_service")

    # `or []` / `or {}` tolerate keys that are present but null in the YAML.
    for svc in graph_cfg.get("services") or []:
        svc_name = svc["name"]
        if svc_name != entry_name:
            continue
        return Service(
            service_name=service_name,
            name=svc_name,
            namespace=(svc.get("dynamo") or {}).get("namespace", "default"),
            version=version,
            path=graph_dir,
            envs=graph_cfg.get("envs", []),
            apis={},
            size_bytes=size_bytes,
        )

    raise ValueError("No entry service found in the graph")
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: this should be used for planner as well and should leverage proper nvml bindings
from __future__ import annotations
import logging
import typing as t
from dataclasses import dataclass
import psutil
try:
import pynvml
PYNVML_AVAILABLE = True
except (ImportError, ModuleNotFoundError):
PYNVML_AVAILABLE = False
logger = logging.getLogger(__name__)
# Constants
NVIDIA_GPU = "nvidia.com/gpu"
class ResourceError(Exception):
    """Root of the exception hierarchy for resource-related failures."""
@dataclass
class GPUProcess:
    """A process reported as running on a GPU, with its memory usage."""

    pid: int
    used_memory: int  # in bytes
    name: str = ""

    def __post_init__(self):
        """Overwrite ``name`` with the process name psutil reports for ``pid``.

        If the process is gone or cannot be inspected, ``name`` is left as it is.
        """
        try:
            resolved = psutil.Process(self.pid).name()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return
        self.name = resolved
class GPUInfo:
    """Static properties and mutable status of one GPU device."""

    def __init__(self, index: int, total_memory: int, name: str, uuid: str):
        # Identity and capacity supplied by the caller.
        self.index, self.name, self.uuid = index, name, uuid
        self.total_memory = total_memory  # in bytes
        # Mutable status, starting from defaults.
        self.available = True  # Can be set to False if GPU is reserved/in use
        self.utilization = 0  # in percent (0-100)
        self.processes: list[GPUProcess] = []

    def __repr__(self) -> str:
        memory_mb = self.total_memory / 1024 / 1024
        return (
            f"GPUInfo(index={self.index}, name='{self.name}', "
            f"total_memory={memory_mb:.0f}MB, available={self.available})"
        )
class GPUManager:
    """
    Manages GPU resources using NVML.

    This class provides methods to:
    - Discover available GPUs
    - Query GPU properties and status
    - Track GPU processes
    - Reset allocation state

    When pynvml is missing or NVML cannot be initialized, the manager stays
    usable but reports no GPUs.
    """

    def __init__(self):
        """Initialize the GPU manager."""
        self.gpus: list[GPUInfo] = []
        self._initialized = False
        # List to track fractional GPU allocations
        # Each item is (gpu_index, fraction_used, fraction_size)
        # E.g. (0, 0.5, 0.5) means GPU 0 has 0.5 used with fraction size of 0.5
        self._gpu_fractions: list[tuple[int, float, float]] = []
        self._init_nvml()

    def _init_nvml(self):
        """Initialize NVML and discover GPUs."""
        if not PYNVML_AVAILABLE:
            logger.warning("PyNVML not available. GPU functionality will be limited.")
            return
        try:
            pynvml.nvmlInit()
            self._initialized = True
            self._discover_gpus()
        except (pynvml.NVMLError, OSError) as e:
            # Catch the NVML base class so that any init failure (not only a
            # missing library or driver) degrades to "no GPUs" instead of
            # raising out of the constructor.
            logger.warning(f"Failed to initialize NVML: {e}")
            self._initialized = False

    def __del__(self):
        """Clean up NVML."""
        # getattr guards against a partially constructed instance.
        if getattr(self, "_initialized", False):
            try:
                pynvml.nvmlShutdown()
            except Exception:  # pylint: disable=broad-except
                pass

    @staticmethod
    def _as_text(value):
        """Return ``value`` as ``str``, decoding it if NVML handed back bytes."""
        # NOTE(review): some pynvml releases return bytes for name/UUID - confirm
        # against the pinned version.
        if isinstance(value, bytes):
            return value.decode("utf-8", errors="replace")
        return value

    def _refresh_device_stats(self, gpu: GPUInfo, handle) -> None:
        """Refresh utilization and process list of ``gpu``; failures keep old values."""
        try:
            gpu.utilization = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        except pynvml.NVMLError:
            logger.debug(f"Could not get utilization for GPU {gpu.index}")
        # Get processes running on GPU
        try:
            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            gpu.processes = [
                GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory) for p in processes
            ]
        except pynvml.NVMLError:
            logger.debug(f"Could not get processes for GPU {gpu.index}")

    def _discover_gpus(self):
        """Discover available GPUs and their properties."""
        if not self._initialized:
            return
        try:
            device_count = pynvml.nvmlDeviceGetCount()
            self.gpus = []
            for i in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = self._as_text(pynvml.nvmlDeviceGetName(handle))
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                uuid = self._as_text(pynvml.nvmlDeviceGetUUID(handle))
                gpu_info = GPUInfo(
                    index=i, total_memory=memory_info.total, name=name, uuid=uuid
                )
                self._refresh_device_stats(gpu_info, handle)
                self.gpus.append(gpu_info)
            logger.info(f"Discovered {len(self.gpus)} GPUs")
        except pynvml.NVMLError as e:
            logger.warning(f"Error discovering GPUs: {e}")

    def update_gpu_stats(self):
        """Update GPU statistics (utilization, memory etc.)."""
        if not self._initialized:
            return
        for gpu in self.gpus:
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu.index)
                # Update memory info
                gpu.total_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total
                # Update utilization and processes
                self._refresh_device_stats(gpu, handle)
            except pynvml.NVMLError as e:
                logger.warning(f"Error updating GPU {gpu.index} stats: {e}")

    def get_gpu_count(self) -> int:
        """Return the number of discovered GPUs."""
        return len(self.gpus)

    def get_available_gpus(self) -> list[int]:
        """Return a list of available GPU indices."""
        return [gpu.index for gpu in self.gpus if gpu.available]

    def get_gpu_memory(self, index: int) -> tuple[int, int]:
        """
        Return (total memory, free memory) in bytes for a specific GPU.

        Args:
            index: GPU index

        Returns:
            Tuple of (total memory, free memory) in bytes; ``(0, 0)`` when NVML
            is unavailable, the index is out of range, or the query fails.
        """
        # Reject negative indices too, rather than passing them to NVML.
        if not self._initialized or not 0 <= index < len(self.gpus):
            return (0, 0)
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            return (memory_info.total, memory_info.free)
        except pynvml.NVMLError as e:
            logger.warning(f"Error getting GPU memory for GPU {index}: {e}")
            return (0, 0)

    def reset_allocations(self):
        """Reset all GPU allocations."""
        self._gpu_fractions = []
        for gpu in self.gpus:
            gpu.available = True

    def get_gpu_stats(self) -> list[dict[str, t.Any]]:
        """
        Get detailed statistics for all GPUs.

        Returns:
            List of dictionaries with GPU statistics
        """
        self.update_gpu_stats()
        stats = []
        for gpu in self.gpus:
            total_memory, free_memory = self.get_gpu_memory(gpu.index)
            used_memory = total_memory - free_memory
            stats.append(
                {
                    "index": gpu.index,
                    "name": gpu.name,
                    "uuid": gpu.uuid,
                    "total_memory": total_memory,
                    "free_memory": free_memory,
                    "used_memory": used_memory,
                    # Guard the division: total is 0 when the query failed.
                    "memory_utilization": used_memory / total_memory * 100
                    if total_memory > 0
                    else 0,
                    "gpu_utilization": gpu.utilization,
                    "process_count": len(gpu.processes),
                    "processes": [
                        {
                            "pid": process.pid,
                            "name": process.name,
                            "used_memory": process.used_memory,
                        }
                        for process in gpu.processes
                    ],
                    "available": gpu.available,
                }
            )
        return stats
def system_resources() -> dict[str, t.Any]:
    """
    Report the GPU resources visible to a freshly created ``GPUManager``.

    Returns:
        Dictionary with the single key 'nvidia.com/gpu', mapped to the list of
        available GPU indices.
    """
    manager = GPUManager()
    return {NVIDIA_GPU: manager.get_available_gpus()}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import tarfile
from datetime import datetime
from typing import Optional
import requests
from dynamo.sdk.core.protocol.deployment import Service
REQUEST_TIMEOUT = 20
def get_host_port():
    """Return ``(host, port)`` read from DYNAMO_HOST / DYNAMO_PORT.

    Falls back to host "0.0.0.0" and port 8000 when a variable is unset.
    """
    env = os.environ
    return env.get("DYNAMO_HOST", "0.0.0.0"), int(env.get("DYNAMO_PORT", 8000))
def get_system_app_host_port():
    """Return ``(host, port)`` for the system app from the environment.

    Reads DYNAMO_SYSTEM_APP_HOST (default "0.0.0.0") and DYNAMO_SYSTEM_APP_PORT
    (default 0, i.e. a random port is chosen).
    """
    env = os.environ
    return (
        env.get("DYNAMO_SYSTEM_APP_HOST", "0.0.0.0"),
        int(env.get("DYNAMO_SYSTEM_APP_PORT", 0)),
    )
def upload_graph(
    endpoint: str,
    graph: str,
    entry_service: Service,
    session: Optional[requests.Session] = None,
    **kwargs,
) -> None:
    """Upload the entire graph as a single component/version, with a manifest of all services.

    Steps: create the component if a GET returns 404, create the version if a
    GET does not return 200, then tar ``entry_service.path`` in memory and PUT
    it to the version's ``/upload`` URL.

    Args:
        endpoint: Base URL of the API server (no trailing slash is added or removed).
        graph: Graph tag in ``<name>:<version>`` form.
        entry_service: Supplies the manifest fields and the directory to upload.
        session: Optional session to reuse. When omitted, a temporary session
            is created and closed before returning.
        **kwargs: ``build_at`` (datetime or ISO-8601 string) overrides the build
            timestamp; an unparsable string falls back to the current UTC time.

    Raises:
        ValueError: If ``graph`` is not in ``<name>:<version>`` form.
        RuntimeError: If the server answers with an unexpected status.
        FileNotFoundError: If ``entry_service.path`` is not a directory.
    """
    from datetime import timezone

    def _utc_now_naive() -> datetime:
        # Same value datetime.utcnow() produced (naive UTC), without calling
        # the API deprecated since Python 3.12.
        return datetime.now(timezone.utc).replace(tzinfo=None)

    parts = graph.split(":")
    if len(parts) != 2:
        raise ValueError(
            f"`graph` must be in '<name>:<version>' format, got '{graph}'."
        )
    graph_name, graph_version = parts

    # Only close the session if this function created it.
    owns_session = not session
    if owns_session:
        session = requests.Session()

    try:
        # Check if component exists before POST
        comp_url = f"{endpoint}/api/v1/dynamo_components"
        comp_get_url = f"{endpoint}/api/v1/dynamo_components/{graph_name}"
        comp_resp = session.get(comp_get_url, timeout=REQUEST_TIMEOUT)
        if comp_resp.status_code == 200:
            comp_exists = True
        elif comp_resp.status_code == 404:
            comp_exists = False
        else:
            raise RuntimeError(
                f"Failed to verify component '{graph_name}': "
                f"{comp_resp.status_code}: {comp_resp.text}"
            )

        if not comp_exists:
            comp_payload = {
                "name": graph_name,
                "description": "Registered by Dynamo's KubernetesDeploymentManager",
            }
            resp = session.post(comp_url, json=comp_payload, timeout=REQUEST_TIMEOUT)
            # 409 means someone else created it in the meantime; that is fine.
            if resp.status_code not in (200, 201, 409):
                raise RuntimeError(f"Failed to create component: {resp.text}")

        # Check if version exists before POST
        ver_url = f"{endpoint}/api/v1/dynamo_components/{graph_name}/versions"
        ver_get_url = (
            f"{endpoint}/api/v1/dynamo_components/{graph_name}/versions/{graph_version}"
        )
        ver_resp = session.get(ver_get_url, timeout=REQUEST_TIMEOUT)
        ver_exists = ver_resp.status_code == 200

        if not ver_exists:
            build_at = kwargs.get("build_at")
            if not build_at:
                build_at = _utc_now_naive()
            if isinstance(build_at, str):
                try:
                    build_at = datetime.fromisoformat(build_at)
                except ValueError:
                    build_at = _utc_now_naive()

            manifest = {
                "service": entry_service.service_name,
                "apis": entry_service.apis,
                "size_bytes": entry_service.size_bytes,
            }
            ver_payload = {
                "name": entry_service.name,
                "description": f"Auto-registered version for {graph}",
                "resource_type": "dynamo_component_version",
                "version": graph_version,
                "manifest": manifest,
                "build_at": build_at.isoformat(),
            }
            resp = session.post(ver_url, json=ver_payload, timeout=REQUEST_TIMEOUT)
            if resp.status_code not in (200, 201, 409):
                raise RuntimeError(f"Failed to create component version: {resp.text}")

        # Upload the graph
        build_dir = entry_service.path
        if not build_dir or not os.path.isdir(build_dir):
            raise FileNotFoundError(f"Built graph directory not found: {build_dir}")

        # Build the (uncompressed) tar archive fully in memory.
        tar_stream = io.BytesIO()
        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
            tar.add(build_dir, arcname=".")
        tar_stream.seek(0)

        upload_url = f"{endpoint}/api/v1/dynamo_components/{graph_name}/versions/{graph_version}/upload"
        upload_headers = {"Content-Type": "application/x-tar"}
        resp = session.put(
            upload_url,
            data=tar_stream,
            headers=upload_headers,
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code not in (200, 201, 204):
            raise RuntimeError(f"Failed to upload graph artifact: {resp.text}")
    finally:
        if owns_session:
            session.close()
def get_capi_library_path() -> str:
    """
    Locate the libdynamo_llm_capi.so library.

    A non-empty VLLM_KV_CAPI_PATH environment variable is returned as-is.
    Otherwise the path is built relative to the installed ``dynamo.sdk``
    package: ``<sdk>/cli/bin/libdynamo_llm_capi.so``.

    Returns:
        The path to the library.
    """
    override = os.environ.get("VLLM_KV_CAPI_PATH")
    if not override:
        # Imported lazily so the environment override works without the package.
        import dynamo.sdk

        sdk_root = os.path.dirname(dynamo.sdk.__file__)
        return os.path.join(sdk_root, "cli", "bin", "libdynamo_llm_capi.so")
    return override
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Middle:
ServiceArgs:
workers: 2
resources:
cpu: "1"
Backend:
ServiceArgs:
workers: 3
resources:
cpu: "1"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is a simple example of a pipeline that uses Dynamo to deploy a backend, middle, and frontend service.
# Use this to test changes made to CLI, SDK, etc
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from dynamo.sdk import depends, endpoint, service
from dynamo.sdk.core.protocol.interface import DynamoTransport
"""
Pipeline Architecture:
Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Backend │
└─────────────┘
"""
# Request body used by the endpoints in this example pipeline: one text field.
class RequestType(BaseModel):
    text: str
# Response model with a single text field.
# NOTE(review): nothing in the visible part of this file references it - confirm it is still needed.
class ResponseType(BaseModel):
    text: str
GPU_ENABLED = False
@service(
    resources={"cpu": "1"},
    traffic={"timeout": 30},
    dynamo={
        "namespace": "inference",
    },
    workers=1,
)
class Backend:
    # Last stage of the example pipeline: appends "-back" to the incoming text
    # and streams it back one whitespace-separated token at a time.

    def __init__(self) -> None:
        print("Starting backend")

    @endpoint()
    async def generate(self, req: RequestType):
        """Generate tokens."""
        incoming = req.text
        print(f"Backend received: {incoming}")
        for piece in f"{incoming}-back".split():
            yield f"Backend: {piece}"

    @endpoint()
    async def generate_v2(self, req: RequestType):
        """Generate tokens."""
        incoming = req.text
        print(f"Backend received: {incoming}")
        for piece in f"{incoming}-back".split():
            yield f"Backend generate_v2: {piece}"
@service(
    resources={"cpu": "2"},
    traffic={"timeout": 30},
    dynamo={"namespace": "inference"},
)
class Backend2:
    # Alternative backend stage that declares a dependency on Backend.
    backend = depends(Backend)

    def __init__(self) -> None:
        print("Starting backend2")

    @endpoint()
    async def generate(self, req: RequestType):
        """Forward requests to backend."""
        # NOTE(review): this only builds and prints the follow-up request; it
        # never calls self.backend and never yields - confirm that is intended.
        incoming = req.text
        print(f"Backend2 received: {incoming}")
        follow_up = RequestType(text=f"{incoming}-back2")
        print(follow_up.model_dump_json())
@service(
    resources={"cpu": "1"},
    traffic={"timeout": 30},
    dynamo={"namespace": "inference"},
)
class Middle:
    # Middle stage: appends "-mid" to the text and relays the backend streams.
    backend = depends(Backend)
    backend2 = depends(Backend2)

    def __init__(self) -> None:
        print("Starting middle")

    @endpoint()
    async def generate(self, req: RequestType):
        """Forward requests to backend."""
        incoming = req.text
        print(f"Middle received: {incoming}")
        payload = RequestType(text=f"{incoming}-mid").model_dump_json()

        # Use Backend (both of its endpoints, in order) when it is set,
        # otherwise fall back to Backend2.
        if self.backend:
            targets = (self.backend.generate, self.backend.generate_v2)
        else:
            targets = (self.backend2.generate,)

        for call in targets:
            async for back_resp in call(payload):
                print(f"Frontend received back_resp: {back_resp}")
                yield f"Frontend: {back_resp}"
@service(
    resources={"cpu": "1"},
    traffic={"timeout": 60},
    dynamo={"namespace": "inference"},
)
class Frontend:
    # Entry point of the example pipeline; its endpoint uses the HTTP transport.
    middle = depends(Middle)
    backend = depends(Backend)

    def __init__(self) -> None:
        print("Starting frontend")

    @endpoint(transports=[DynamoTransport.HTTP])
    async def generate(self, request: RequestType):
        """Stream results from the pipeline."""
        print(f"Frontend received: {request.text}")

        async def relay():
            # The request is serialized only once the response starts streaming.
            async for chunk in self.middle.generate(request.model_dump_json()):
                yield f"Frontend: {chunk}"

        return StreamingResponse(relay())
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pytest
from dynamo.sdk.lib.config import ServiceConfig
pytestmark = pytest.mark.pre_merge
def test_service_config_with_common_configs(monkeypatch):
    """Keys listed under "common-configs" are pulled from "Common" into the args."""
    # Reset singleton instance; monkeypatch restores the previous value and the
    # environment after the test so nothing leaks into later tests.
    monkeypatch.setattr(ServiceConfig, "_instance", None, raising=False)
    # Set environment variable with config that includes common-configs
    monkeypatch.setenv(
        "DYNAMO_SERVICE_CONFIG",
        """
        {
            "Common": {
                "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
                "block-size": 64,
                "max-model-len": 16384
            },
            "VllmWorker": {
                "enforce-eager": true,
                "common-configs": ["model", "block-size", "max-model-len"]
            }
        }
        """,
    )
    # Get arguments and verify common configs are included
    service_config = ServiceConfig.get_instance()
    vllm_worker_args = service_config.as_args("VllmWorker")
    # Check that each common config appears in the arguments
    for key in ["model", "block-size", "max-model-len"]:
        assert f"--{key}" in vllm_worker_args
def test_service_config_without_common_configs(monkeypatch):
    """Without a "common-configs" list, nothing from "Common" reaches the args."""
    # Reset singleton instance; monkeypatch restores the previous value and the
    # environment after the test so nothing leaks into later tests.
    monkeypatch.setattr(ServiceConfig, "_instance", None, raising=False)
    # Set environment variable with config that DOESN'T include common-configs
    monkeypatch.setenv(
        "DYNAMO_SERVICE_CONFIG",
        """
        {
            "Common": {
                "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
                "block-size": 64,
                "max-model-len": 16384
            },
            "VllmWorker": {
                "enforce-eager": true
            }
        }
        """,
    )
    # Get arguments and verify common configs are NOT included
    service_config = ServiceConfig.get_instance()
    vllm_worker_args = service_config.as_args("VllmWorker")
    # Check that none of the common configs appear in arguments
    for key in ["model", "block-size", "max-model-len"]:
        assert f"--{key}" not in vllm_worker_args
def test_service_config_with_direct_configs(monkeypatch):
    """Options set directly on the service section show up in the args."""
    # Reset singleton instance; monkeypatch restores the previous value and the
    # environment after the test so nothing leaks into later tests.
    monkeypatch.setattr(ServiceConfig, "_instance", None, raising=False)
    # Set environment variable with direct configs (no Common section reference)
    monkeypatch.setenv(
        "DYNAMO_SERVICE_CONFIG",
        """
        {
            "VllmWorker": {
                "enforce-eager": true,
                "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
                "block-size": 64,
                "max-model-len": 16384
            }
        }
        """,
    )
    # Get arguments and verify direct configs are included
    service_config = ServiceConfig.get_instance()
    vllm_worker_args = service_config.as_args("VllmWorker")
    # Check that each config appears in the arguments
    for key in ["model", "block-size", "max-model-len"]:
        assert f"--{key}" in vllm_worker_args
def test_service_config_override_common_configs(monkeypatch):
    """A value set on the service wins over the same key taken from "Common"."""
    # Reset singleton instance; monkeypatch restores the previous value and the
    # environment after the test so nothing leaks into later tests.
    monkeypatch.setattr(ServiceConfig, "_instance", None, raising=False)
    # Set environment variable with config that includes common-configs
    # overridden by the subscribing config
    monkeypatch.setenv(
        "DYNAMO_SERVICE_CONFIG",
        """
        {
            "Common": {
                "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
                "block-size": 64,
                "max-model-len": 16384
            },
            "VllmWorker": {
                "enforce-eager": true,
                "block-size": 128,
                "common-configs": ["model", "block-size", "max-model-len"]
            }
        }
        """,
    )
    # Get arguments and verify common configs are included
    service_config = ServiceConfig.get_instance()
    vllm_worker_args = service_config.as_args("VllmWorker")
    # Check that each common config appears in the arguments
    for key in ["model", "block-size", "max-model-len"]:
        assert f"--{key}" in vllm_worker_args
    # The service-level block-size (128) must be the value that follows the flag
    assert vllm_worker_args[vllm_worker_args.index("--block-size") + 1] == "128"
def test_explicit_boolean_arguments(monkeypatch):
    """Test that boolean arguments are handled correctly with new logic"""
    # Reset singleton instance; monkeypatch restores the previous value and the
    # environment after the test so nothing leaks into later tests.
    monkeypatch.setattr(ServiceConfig, "_instance", None, raising=False)
    # Set environment variable with boolean configs
    monkeypatch.setenv(
        "DYNAMO_SERVICE_CONFIG",
        """
        {
            "VllmWorker": {
                "enable-prefix-caching": true,
                "disable-sliding-window": false,
                "enforce-eager": true
            }
        }
        """,
    )
    # Get arguments and verify boolean handling
    service_config = ServiceConfig.get_instance()
    vllm_worker_args = service_config.as_args("VllmWorker")
    last_idx = len(vllm_worker_args) - 1

    # Check that true values are passed as flags only
    assert "--enable-prefix-caching" in vllm_worker_args
    # Should NOT have a following "true" value
    enable_idx = vllm_worker_args.index("--enable-prefix-caching")
    assert enable_idx == last_idx or vllm_worker_args[enable_idx + 1] != "true"

    # Check that false values for standard boolean flags are omitted
    assert "--disable-sliding-window" not in vllm_worker_args

    # Check that another true value works as flag
    assert "--enforce-eager" in vllm_worker_args
    enforce_idx = vllm_worker_args.index("--enforce-eager")
    assert enforce_idx == last_idx or vllm_worker_args[enforce_idx + 1] != "true"
def test_vllm_boolean_arguments_special_handling(monkeypatch):
    """Test that vLLM boolean arguments with special defaults are handled correctly"""
    # Reset singleton instance; monkeypatch restores the previous value and the
    # environment after the test so nothing leaks into later tests.
    monkeypatch.setattr(ServiceConfig, "_instance", None, raising=False)
    # Set environment variable with vLLM boolean configs
    monkeypatch.setenv(
        "DYNAMO_SERVICE_CONFIG",
        """
        {
            "VllmWorker": {
                "enable-prefix-caching": false,
                "use-tqdm-on-load": false,
                "multi-step-stream-outputs": false,
                "some-other-flag": false
            }
        }
        """,
    )
    # Get arguments and verify vLLM special boolean handling
    service_config = ServiceConfig.get_instance()
    vllm_worker_args = service_config.as_args("VllmWorker")

    # Each of these false values must be emitted as the negative "--no-" flag
    for flag in (
        "enable-prefix-caching",
        "use-tqdm-on-load",
        "multi-step-stream-outputs",
    ):
        assert f"--no-{flag}" in vllm_worker_args
        assert f"--{flag}" not in vllm_worker_args

    # Check that other false flags are omitted (standard behavior)
    assert "--some-other-flag" not in vllm_worker_args
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import subprocess
import time
import pytest
from typer.testing import CliRunner
from dynamo.sdk.cli.cli import cli
runner = CliRunner()
@pytest.fixture(scope="module", autouse=True)
def setup_and_teardown():
    """Start nats-server, etcd and `dynamo serve` for the module; yield the dry-run result.

    Every process that was started is terminated in the ``finally`` block, so
    a failure part-way through setup does not leave processes behind.
    """
    serve_args = [
        "serve",
        "pipeline:Frontend",
        "--working-dir",
        "deploy/sdk/src/dynamo/sdk/tests",
        "--Frontend.model=qwentastic",
        "--Middle.bias=0.5",
    ]
    started = []  # processes in start order; torn down in reverse
    try:
        # Setup code
        started.append(subprocess.Popen(["nats-server", "-js"]))
        started.append(subprocess.Popen(["etcd"]))
        print("Setting up resources")

        # Run the serve command in dry-run mode with CLI runner to check it's working
        result = runner.invoke(cli, [*serve_args, "--dry-run"])

        # Now start the actual server using subprocess for the real integration test
        started.append(subprocess.Popen(["dynamo", *serve_args]))
        # Give the server time to come up; the test itself retries as well.
        time.sleep(5)

        yield result
    finally:
        # Teardown code
        print("Tearing down resources")
        for proc in reversed(started):
            proc.terminate()
            try:
                proc.wait(timeout=30)
            except subprocess.TimeoutExpired:
                # Do not hang the test run on a process that ignores SIGTERM.
                proc.kill()
                proc.wait()
async def test_pipeline(setup_and_teardown):
    """Validate the dry-run output, then call the served pipeline over HTTP.

    The fixture value is the result of the dry-run CLI invocation. After
    checking it, the test POSTs to the local ``/generate`` endpoint, retrying
    up to five times with a three second pause between failed attempts.
    """
    # Check the CLI command ran successfully
    dry_run = setup_and_teardown
    assert dry_run.exit_code == 0

    # Strip ANSI colour escape sequences before matching on the text.
    plain_output = re.sub(r"\x1b\[[0-9;]*m", "", dry_run.output)
    for fragment in (
        "Service Configuration:",
        '"Frontend": {',
        '"model": "qwentastic"',
    ):
        assert fragment in plain_output

    import asyncio

    import aiohttp

    payload = {"text": "federer-is-the-greatest-tennis-player-of-all-time"}
    expected_reply = "federer-is-the-greatest-tennis-player-of-all-time-mid-back"

    max_retries = 5
    for attempt in range(max_retries):
        try:
            async with aiohttp.ClientSession() as http:
                async with http.post(
                    "http://localhost:8000/generate",
                    json=payload,
                    headers={"accept": "text/event-stream"},
                ) as response:
                    assert response.status == 200
                    body = await response.text()
                    assert expected_reply in body
        except Exception as err:
            # Any failure (connection error or failed assertion) is retried;
            # the last attempt re-raises so the test fails with the real cause.
            if attempt == max_retries - 1:
                raise
            print(f"Attempt {attempt + 1} failed, retrying... {err}")
            await asyncio.sleep(3)
        else:
            break
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import subprocess
import time
from collections import Counter
import pytest
from typer.testing import CliRunner
from dynamo.sdk.cli.cli import cli
runner = CliRunner()
@pytest.fixture(scope="module", autouse=True)
def setup_and_teardown():
    """Start nats-server, etcd and a config-file driven ``dynamo serve`` pipeline.

    Yields:
        The ``CliRunner`` result of the ``serve ... -f config.yaml --dry-run``
        invocation, so tests can inspect its exit code and output.

    Every process that was started is terminated and reaped when the fixture
    finishes, including when setup itself fails part-way (for example when a
    later binary cannot be launched), so no background process is left behind.
    """
    # Processes in the order they must be stopped. The server is inserted at
    # the front so it is stopped before nats-server and etcd.
    teardown_order = []
    try:
        # Setup code
        nats_server = subprocess.Popen(["nats-server", "-js"])
        teardown_order.append(nats_server)
        etcd = subprocess.Popen(["etcd"])
        teardown_order.append(etcd)
        print("Setting up resources")

        # Run the serve command in dry-run mode with CLI runner to check it's working
        result = runner.invoke(
            cli,
            [
                "serve",
                "pipeline:Frontend",
                "--working-dir",
                "deploy/sdk/src/dynamo/sdk/tests",
                "-f",
                "deploy/sdk/src/dynamo/sdk/tests/config.yaml",
                "--dry-run",
            ],
        )

        # Now start the actual server using subprocess for the real integration test
        server = subprocess.Popen(
            [
                "dynamo",
                "serve",
                "pipeline:Frontend",
                "--working-dir",
                "deploy/sdk/src/dynamo/sdk/tests",
                "-f",
                "deploy/sdk/src/dynamo/sdk/tests/config.yaml",
            ]
        )
        teardown_order.insert(0, server)

        # NOTE(review): presumably a grace period for the server to come up;
        # the test additionally retries its HTTP request.
        time.sleep(3)
        yield result
    finally:
        # Teardown code: runs after the module's tests and also when setup
        # raised before reaching the yield.
        print("Tearing down resources")
        for proc in teardown_order:
            proc.terminate()
            proc.wait()
async def test_pipeline(setup_and_teardown):
    """Check worker process counts and the HTTP response of the served pipeline.

    The fixture value is the result of the dry-run CLI invocation. The test
    then counts the child processes of the current process by the value that
    follows their ``--service-name`` argument, and finally POSTs to the local
    ``/generate`` endpoint, retrying up to five times with a three second
    pause between failed attempts.
    """
    import asyncio

    import aiohttp

    # Check the CLI command ran successfully
    result = setup_and_teardown
    assert result.exit_code == 0

    import psutil

    # Capture list of subprocesses (children of current process)
    current_process = psutil.Process()
    child_processes = current_process.children(recursive=True)

    # Count processes per service name taken from their command line.
    service_count: Counter[str] = Counter()
    for proc in child_processes:
        try:
            cmd = proc.cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited or is not inspectable; ignore it.
            continue
        # Look for the flag as its own argv element. A substring test on the
        # joined command line can match text that ``cmd.index`` then fails to
        # find, raising ValueError.
        if "--service-name" not in cmd:
            continue
        idx = cmd.index("--service-name")
        # Guard against the flag being the last argument with no value.
        if idx + 1 < len(cmd):
            service_count[cmd[idx + 1]] += 1

    assert service_count["Frontend"] == 1
    assert service_count["Backend"] == 3
    assert service_count["Middle"] == 2

    # Clean the output to check for expected content
    clean_output = re.sub(r"\x1b\[[0-9;]*m", "", result.output)
    assert "Service Configuration:" in clean_output

    max_retries = 5
    for attempt in range(max_retries):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    "http://localhost:8000/generate",
                    json={"text": "federer-is-the-greatest-tennis-player-of-all-time"},
                    headers={"accept": "text/event-stream"},
                ) as resp:
                    assert resp.status == 200
                    text = await resp.text()
                    assert (
                        "federer-is-the-greatest-tennis-player-of-all-time-mid-back"
                        in text
                    )
                    break
        except Exception as e:
            # Any failure (connection error or failed assertion) is retried;
            # the last attempt re-raises so the test fails with the real cause.
            if attempt == max_retries - 1:
                raise
            print(f"Attempt {attempt + 1} failed, retrying... {e}")
            await asyncio.sleep(3)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dynamo.sdk.core.protocol.interface import LinkedServices
def test_remove_backend2():
    """Linking Frontend -> Middle -> Backend and pruning removes the unlinked edges.

    Before linking, Frontend depends on ``backend`` and ``middle`` and Middle
    depends on ``backend`` and ``backend2``. After the link chain and
    ``LinkedServices.remove_unused_edges()``, only Frontend -> ``middle`` and
    Middle -> ``backend`` are expected to remain.
    """
    from dynamo.sdk.tests.pipeline import Backend, Backend2, Frontend, Middle

    # Initial state: dependency names and their targets as declared.
    declared_edges = (
        (Frontend, {"backend": Backend, "middle": Middle}),
        (Middle, {"backend": Backend, "backend2": Backend2}),
    )
    for svc, edges in declared_edges:
        assert set(svc.dependencies.keys()) == set(edges)
        for edge_name, target in edges.items():
            assert svc.dependencies[edge_name].on == target
    assert Backend.dependencies == {}

    # Build the chain, then prune everything that was not linked.
    chain = Frontend.link(Middle)
    chain.link(Backend)
    LinkedServices.remove_unused_edges()

    # Final state assertions after linking and cleanup
    frontend_deps = Frontend.dependencies
    assert frontend_deps["middle"].on == Middle
    assert set(frontend_deps.keys()) == {"middle"}

    middle_deps = Middle.dependencies
    assert set(middle_deps.keys()) == {"backend"}
    assert middle_deps["backend"].on == Backend

    assert Backend.dependencies == {}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
from dynamo.sdk.cli.utils import configure_target_environment
from dynamo.sdk.core.protocol.interface import ServiceInterface
from dynamo.sdk.core.runner import TargetEnum
@pytest.fixture(scope="module", autouse=True)
def setup_and_teardown():
    """Configure the ``TargetEnum.DYNAMO`` target before this module's tests run."""
    target = TargetEnum.DYNAMO
    configure_target_environment(target)
    # Nothing is handed to the tests and there is no teardown step.
    yield None
def test_gpu_resources(setup_and_teardown):
    """String cpu/gpu/memory values given to ``@service`` appear on the config."""
    from dynamo.sdk import service

    @service(
        resources={"cpu": "2", "gpu": "1", "memory": "4Gi"},
        dynamo={"namespace": "test"},
    )
    class MyService:
        def __init__(self) -> None:
            pass

    # The decorated class is used as the service object.
    dyn_svc: ServiceInterface = MyService
    assert dyn_svc.config is not None  # type: ignore

    resources = dyn_svc.config.resources
    expected = (("cpu", "2"), ("gpu", "1"), ("memory", "4Gi"))
    for field, value in expected:
        assert getattr(resources, field) == value
def test_gpu_resources_coercing_from_integers(setup_and_teardown):
    """Integer cpu/gpu values given to ``@service`` are exposed as strings."""
    from dynamo.sdk import service

    @service(
        resources={"cpu": 3, "gpu": 4, "memory": "4Gi"},
        dynamo={"namespace": "test"},
    )
    class MockService:
        def __init__(self) -> None:
            pass

    # The decorated class is used as the service object.
    dyn_svc: ServiceInterface = MockService
    assert dyn_svc.config is not None  # type: ignore

    resources = dyn_svc.config.resources
    # cpu and gpu were passed as ints above; the config reports them as str.
    expected = (("cpu", "3"), ("gpu", "4"), ("memory", "4Gi"))
    for field, value in expected:
        assert getattr(resources, field) == value
#!/bin/bash
#!/bin/bash -e
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on any error, on use of an unset variable, and on a failure anywhere
# in a pipeline.
set -euo pipefail

# Defaults below apply only when the caller has not already set the variable.
export DYNAMO_CLOUD="${DYNAMO_CLOUD:-http://dynamo-cloud}"
export DYNAMO_IMAGE="${DYNAMO_IMAGE:-dynamo-base:latest}"
export DEPLOYMENT_NAME="${DEPLOYMENT_NAME:-ci-hw}"

cd /workspace/examples/hello_world

# Step.1: Login to dynamo cloud
# Quoted so the endpoint is passed as a single argument, without
# word-splitting or glob expansion.
dynamo cloud login "$DYNAMO_CLOUD"

# Step.2: build a dynamo nim with framework-less base
# The tag is the text between the first pair of double quotes on the
# "Successfully built" line of the build output.
DYNAMO_TAG=$(dynamo build hello_world:Frontend | grep "Successfully built" | awk -F"\"" '{ print $2 }')

# Step.3: Deploy!
echo "$DYNAMO_TAG"
# TODO: Deploy your service using a DynamoGraphDeployment CR.
......@@ -34,7 +34,7 @@ For example, the deployment configuration `examples/llm/configs/disagg.yaml` hav
- `Processor`: When a new request arrives, `Processor` applies the chat template and performs the tokenization. Then, it routes the request to the `VllmWorker`.
- `VllmWorker` and `PrefillWorker`: Perform the actual decode and prefill computation.
Since the four workers are deployed in different processes, each of them have their own `DistributedRuntime`. Within their own `DistributedRuntime`, they all have their own `Namespace`s named `dynamo`. Then, under their own `dynamo` namespace, they have their own `Component`s named `Frontend/Processor/VllmWorker/PrefillWorker`. Lastly, for the `Endpoint`, `Frontend` has no `Endpoints`, `Processor` and `VllmWorker` each has a `generate` endpoint, and `PrefillWorker` has a placeholder `mock` endpoint. Their `DistributedRuntime`s and `Namespace`s are set in the `@service` decorators in `examples/llm/components/<frontend/processor/worker/prefill_worker>.py`. Their `Component`s are set by their name in `/deploy/dynamo/sdk/src/dynamo/sdk/cli/serve_dynamo.py`. Their `Endpoint`s are set by the `@endpoint` decorators in `examples/llm/components/<frontend/processor/worker/prefill_worker>.py`.
Since the four workers are deployed in different processes, each of them has its own `DistributedRuntime`. Within their own `DistributedRuntime`, they all have their own `Namespace`s named `dynamo`. Then, under their own `dynamo` namespace, they have their own `Component`s named `Frontend/Processor/VllmWorker/PrefillWorker`. Lastly, for the `Endpoint`, `Frontend` has no `Endpoint`s, `Processor` and `VllmWorker` each have a `generate` endpoint, and `PrefillWorker` has a placeholder `mock` endpoint.
## Initialization
......
......@@ -24,8 +24,6 @@
**Dynamo Cloud** - A Kubernetes platform providing managed deployment experience for Dynamo inference graphs.
## E
**@endpoint** - A Python decorator used to define service endpoints within a Dynamo component.
**Endpoint** - A specific network-accessible API within a Dynamo component, such as `generate` or `load_metrics`.
## F
......@@ -70,8 +68,6 @@
**RDMA (Remote Direct Memory Access)** - Technology that allows direct memory access between distributed systems, used for efficient KV cache transfers.
## S
**@service** - Python decorator used to define a Dynamo service class.
**SGLang** - Fast LLM inference framework with native embedding support and RadixAttention.
## T
......
......@@ -90,7 +90,6 @@ build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = [
"deploy/sdk/src/dynamo",
"components/frontend/src/dynamo",
"components/planner/src/dynamo",
"components/backends/llama_cpp/src/dynamo",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment