feat: deprecate sdk as dependency (#2149)

47477909 · Biswa Panda · GitHub · 095ea3e7 · 095ea3e7 · 095ea3e7
Unverified Commit 47477909 authored Jul 28, 2025 by Biswa Panda Committed by GitHub Jul 29, 2025
14 changed files
--- a/deploy/sdk/src/dynamo/sdk/lib/loader.py
+++ b/deploy/sdk/src/dynamo/sdk/lib/loader.py
-#  SPDX-FileCopyrightText: Copyright (c) 2020 Atalaya Tech. Inc
-#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#  SPDX-License-Identifier: Apache-2.0
-#  #
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#  #
-#  http://www.apache.org/licenses/LICENSE-2.0
-#  #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#  Modifications Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
-
-from __future__ import annotations
-
-import importlib
-import logging
-import os
-import sys
-from typing import Optional, TypeVar
-
-import yaml
-
-from dynamo.sdk.core.protocol.deployment import Service
-from dynamo.sdk.core.protocol.interface import ServiceInterface
-
-logger = logging.getLogger(__name__)
-T = TypeVar("T", bound=object)
-
-
-def find_and_load_service(
-    import_str: str,
-    working_dir: Optional[str] = None,
-) -> ServiceInterface:
-    """Resolve an import string to a service, importing relative to ``working_dir``.
-
-    While the import runs, the process working directory is switched to
-    ``working_dir`` (when one is given) and that directory is placed at the
-    front of ``sys.path`` (when it is not already listed). Both changes are
-    undone before returning, whether the import succeeds or raises.
-
-    Args:
-        import_str: "module[:attribute]" or "path/to/file.py[:attribute]", e.g.
-            "fraud_detector:svc", "./path/to/service.py:MyService", or just
-            "fraud_detector" to pick the module's single root service.
-        working_dir: Base directory for the import; ``~`` is expanded and
-            symlinks are resolved. Defaults to the current working directory.
-
-    Returns:
-        Whatever ``_do_import`` resolves ``import_str`` to.
-
-    Raises:
-        ImportError: If the module cannot be imported.
-        ValueError: If no service is found or several root services exist.
-    """
-    logger.debug(f"Loading service from import string: {import_str}")
-    logger.debug(f"Working directory: {working_dir or os.getcwd()}")
-
-    # Only set when we actually change directory, so we know what to restore.
-    original_cwd = None
-    if working_dir is None:
-        working_dir = os.getcwd()
-    else:
-        original_cwd = os.getcwd()
-        working_dir = os.path.realpath(os.path.expanduser(working_dir))
-        logger.debug(f"Changing working directory to: {working_dir}")
-        os.chdir(working_dir)
-
-    # Remember whether the sys.path entry is ours, so a pre-existing entry is
-    # never removed on the way out.
-    added_to_sys_path = working_dir not in sys.path
-    if added_to_sys_path:
-        logger.debug(f"Adding {working_dir} to sys.path")
-        sys.path.insert(0, working_dir)
-
-    try:
-        return _do_import(import_str, working_dir)
-    finally:
-        if added_to_sys_path:
-            logger.debug(f"Removing {working_dir} from sys.path")
-            sys.path.remove(working_dir)
-        if original_cwd is not None:
-            logger.debug(f"Restoring working directory to: {original_cwd}")
-            os.chdir(original_cwd)
-
-
-def _do_import(import_str: str, working_dir: str) -> ServiceInterface:
-    """Import the module named by ``import_str`` and return the selected service.
-
-    ``import_str`` is split at the first ":" into a module part and an
-    optional attribute part. The module part is either a dotted module name
-    or a path to a ``.py`` file under ``working_dir``; a file path is turned
-    into a dotted module name by walking up through parent directories that
-    contain ``__init__.py``.
-
-    Without an attribute part, the module's ``ServiceInterface`` instances are
-    scanned and the single one that no other service depends on is returned.
-    With an attribute part, its dot-separated names are resolved in order:
-    ``getattr`` on ordinary objects, ``dependencies[name].on`` once the
-    current object is a ``ServiceInterface``.
-
-    Args:
-        import_str: "<module>[:<attribute>]" or "<path/to/file.py>[:<attribute>]".
-        working_dir: Directory that file-path imports must be located under.
-
-    Returns:
-        The resolved object; ``_import_str`` is set on it if it had none.
-
-    Raises:
-        ImportError: If a file path is outside ``working_dir``, is not a
-            ``.py`` file, or the module fails to import.
-        ValueError: If the module part is empty, no (root) service is found,
-            several root services exist, or an attribute cannot be resolved.
-    """
-    import_path, _, attrs_str = import_str.partition(":")
-    logger.debug(f"Parsed import string - path: {import_path}, attributes: {attrs_str}")
-
-    if not import_path:
-        raise ValueError(
-            f'Invalid import string "{import_str}", must be in format '
-            '"<module>:<attribute>" or "<module>"'
-        )
-
-    # Handle file path vs module name imports
-    if os.path.isfile(import_path):
-        logger.debug(f"Importing from file path: {import_path}")
-        import_path = os.path.realpath(import_path)
-        # Compare whole path components rather than raw string prefixes, so a
-        # sibling such as "/work/app2/x.py" is not accepted for "/work/app".
-        try:
-            base_dir = os.path.commonpath([working_dir])
-            inside_working_dir = (
-                os.path.commonpath([import_path, working_dir]) == base_dir
-            )
-        except ValueError:
-            # Raised e.g. when the paths are on different drives or mix
-            # absolute and relative forms; treat that as "not inside".
-            inside_working_dir = False
-        if not inside_working_dir:
-            raise ImportError(
-                f'Module "{import_path}" not found in working directory "{working_dir}"'
-            )
-
-        file_name, ext = os.path.splitext(import_path)
-        if ext != ".py":
-            raise ImportError(
-                f'Invalid module extension "{ext}", only ".py" files are supported'
-            )
-
-        # Build module name from path components, innermost first; stop at the
-        # first parent that is not a package or once working_dir is reached.
-        module_parts = []
-        path = file_name
-        while True:
-            path, name = os.path.split(path)
-            module_parts.append(name)
-            if (
-                not os.path.exists(os.path.join(path, "__init__.py"))
-                or path == working_dir
-            ):
-                break
-        module_name = ".".join(module_parts[::-1])
-        logger.debug(f"Constructed module name from path: {module_name}")
-    else:
-        logger.debug(f"Importing from module name: {import_path}")
-        module_name = import_path
-
-    try:
-        logger.debug(f"Attempting to import module: {module_name}")
-        module = importlib.import_module(module_name)
-    except ImportError as e:
-        # Chain the cause so the original traceback stays visible.
-        raise ImportError(f'Failed to import module "{module_name}": {e}') from e
-
-    # If no specific attribute given, find the root service
-    if not attrs_str:
-        logger.debug("No attributes specified, searching for root service")
-        services = [
-            (name, obj)
-            for name, obj in module.__dict__.items()
-            if isinstance(obj, ServiceInterface)
-        ]
-        logger.debug(f"Found {len(services)} DynamoService instances")
-
-        if not services:
-            raise ValueError(
-                f"No DynamoService instances found in module '{module_name}'"
-            )
-
-        # Find root services (those that aren't dependencies of other services)
-        dependents = set()
-        for _, svc in services:
-            for dep in svc.dependencies.values():
-                if dep.on is not None:
-                    dependents.add(dep.on)
-
-        root_services = [(n, s) for n, s in services if s not in dependents]
-        logger.debug(f"Found {len(root_services)} root services")
-
-        if not root_services:
-            raise ValueError(
-                f"No root DynamoService found in module '{module_name}'. "
-                "All services are dependencies of other services."
-            )
-        if len(root_services) > 1:
-            names = [n for n, _ in root_services]
-            raise ValueError(
-                f"Multiple root services found in module '{module_name}': {names}. "
-                "Please specify which service to use with '<module>:<service_name>'"
-            )
-
-        _, instance = root_services[0]
-        logger.debug(f"Selected root service: {instance}")
-    else:
-        # Navigate through dot-separated attributes
-        logger.debug(f"Navigating attributes: {attrs_str}")
-        instance = module
-        for attr in attrs_str.split("."):
-            try:
-                if isinstance(instance, ServiceInterface):
-                    logger.debug(f"Following dependency link: {attr}")
-                    instance = instance.dependencies[attr].on
-                else:
-                    logger.debug(f"Getting attribute: {attr}")
-                    instance = getattr(instance, attr)
-            except (AttributeError, KeyError) as e:
-                raise ValueError(
-                    f'Attribute "{attr}" not found in "{module_name}"'
-                ) from e
-
-    # Set import string for debugging/logging. object.__setattr__ bypasses any
-    # custom __setattr__ defined on the instance's class.
-    if not hasattr(instance, "_import_str"):
-        import_str_val = f"{module_name}:{attrs_str}" if attrs_str else module_name
-        logger.debug(f"Setting _import_str to: {import_str_val}")
-        object.__setattr__(instance, "_import_str", import_str_val)
-
-    return instance
-
-
-def _get_dir_size(path: str) -> int:
-    """Return the combined size in bytes of all regular files under ``path``.
-
-    The tree is traversed with ``os.walk``; entries for which
-    ``os.path.isfile`` is false are not counted. The total is also logged at
-    debug level.
-    """
-    candidates = (
-        os.path.join(root, filename)
-        for root, _, filenames in os.walk(path)
-        for filename in filenames
-    )
-    size = sum(os.path.getsize(fp) for fp in candidates if os.path.isfile(fp))
-    logger.debug(f"Total size of {path}: {size} bytes")
-    return size
-
-
-def load_entry_service(
-    graph_tag: str, build_dir: str = "~/.dynamo/packages"
-) -> Service:
-    """Load the entry service of a built graph as a deployment ``Service``.
-
-    The graph is expected at ``<build_dir>/<name>/<version>`` and must contain
-    a ``dynamo.yaml``. The graph's ``src`` directory is prepended to
-    ``sys.path`` (and left there) if it is not already listed.
-
-    Args:
-        graph_tag: Built graph tag in ``name:version`` form,
-            e.g. ``frontend:2uk2fwzvqsswvs7t``.
-        build_dir: Root directory of built graphs; ``~`` is expanded.
-
-    Returns:
-        A ``Service`` describing the entry in ``services`` whose name equals
-        the config's ``entry_service``.
-
-    Raises:
-        ValueError: If ``graph_tag`` has no ":" or no entry service is found.
-        FileNotFoundError: If the graph directory or ``dynamo.yaml`` is missing.
-    """
-    if ":" not in graph_tag:
-        raise ValueError("graph_tag must be in the form name:version")
-    name, version = graph_tag.split(":", 1)
-    graph_dir = os.path.expanduser(f"{build_dir}/{name}/{version}")
-    if not os.path.isdir(graph_dir):
-        raise FileNotFoundError(f"Graph directory not found: {graph_dir}")
-
-    config_path = os.path.join(graph_dir, "dynamo.yaml")
-    if not os.path.isfile(config_path):
-        raise FileNotFoundError(f"Graph config (dynamo.yaml) not found in {graph_dir}")
-    with open(config_path, encoding="utf-8") as f:
-        # safe_load returns None for an empty document; fall back to an empty
-        # mapping so the lookups below end in the "no entry service" error
-        # instead of an AttributeError.
-        graph_cfg = yaml.safe_load(f) or {}
-
-    # Add src_dir to sys.path if needed
-    src_dir = os.path.join(graph_dir, "src")
-    if src_dir not in sys.path:
-        sys.path.insert(0, src_dir)
-
-    # Compute size_bytes as the total size of the dynamo directory
-    size_bytes = _get_dir_size(graph_dir)
-
-    service_name = graph_cfg.get("service")
-    entry_name = graph_cfg.get("entry_service")
-    # `services: null` in the YAML yields None rather than a missing key.
-    for svc in graph_cfg.get("services") or []:
-        svc_name = svc["name"]
-        if svc_name != entry_name:
-            continue
-        return Service(
-            service_name=service_name,
-            name=svc_name,
-            namespace=svc.get("dynamo", {}).get("namespace", "default"),
-            version=version,
-            path=graph_dir,
-            envs=graph_cfg.get("envs", []),
-            apis={},
-            size_bytes=size_bytes,
-        )
-    raise ValueError("No entry service found in the graph")
--- a/deploy/sdk/src/dynamo/sdk/lib/resource.py
+++ b/deploy/sdk/src/dynamo/sdk/lib/resource.py
-#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#  SPDX-License-Identifier: Apache-2.0
-#  #
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#  #
-#  http://www.apache.org/licenses/LICENSE-2.0
-#  #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# TODO: this should be used for planner as well and should leverage proper nvml bindings
-
-from __future__ import annotations
-
-import logging
-import typing as t
-from dataclasses import dataclass
-
-import psutil
-
-try:
-    import pynvml
-
-    PYNVML_AVAILABLE = True
-except (ImportError, ModuleNotFoundError):
-    PYNVML_AVAILABLE = False
-
-logger = logging.getLogger(__name__)
-
-# Constants
-NVIDIA_GPU = "nvidia.com/gpu"
-
-
-class ResourceError(Exception):
-    """Common base class for errors raised by resource handling."""
-
-
-@dataclass
-class GPUProcess:
-    """A process reported as running on a GPU, with its memory footprint."""
-
-    pid: int
-    used_memory: int  # bytes
-    name: str = ""
-
-    def __post_init__(self):
-        """Fill in ``name`` from psutil; keep the given value if the lookup fails."""
-        try:
-            process_name = psutil.Process(self.pid).name()
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
-            # Process already gone or not inspectable: leave ``name`` untouched.
-            return
-        self.name = process_name
-
-
-class GPUInfo:
-    """Static properties and mutable status of a single GPU device."""
-
-    def __init__(self, index: int, total_memory: int, name: str, uuid: str):
-        # Identity and capacity, exactly as supplied by the caller.
-        self.index = index
-        self.total_memory = total_memory  # bytes
-        self.name = name
-        self.uuid = uuid
-        # Mutable status; every GPU starts out free, idle and without processes.
-        self.available = True  # set to False when the GPU is reserved/in use
-        self.utilization = 0  # percent, 0-100
-        self.processes: list[GPUProcess] = []
-
-    def __repr__(self) -> str:
-        total_mb = self.total_memory / 1024 / 1024
-        return (
-            f"GPUInfo(index={self.index}, name='{self.name}', "
-            f"total_memory={total_mb:.0f}MB, available={self.available})"
-        )
-
-
-class GPUManager:
-    """
-    Manages GPU resources using NVML.
-
-    This class provides methods to:
-    - Discover the GPUs visible to NVML
-    - Query GPU properties, memory and utilization
-    - Track the compute processes running on each GPU
-    - Report which GPUs are flagged available and reset those flags
-
-    NOTE(review): an earlier version of this docstring also listed GPU
-    allocation/release and CUDA_VISIBLE_DEVICES generation; no method in this
-    class implements either, and ``_gpu_fractions`` is only ever reset.
-
-    If pynvml is missing or NVML cannot be initialized, the manager stays
-    usable but reports zero GPUs.
-    """
-
-    def __init__(self):
-        """Initialize the GPU manager."""
-        self.gpus: list[GPUInfo] = []
-        self._initialized = False
-        # List to track fractional GPU allocations
-        # Each item is (gpu_index, fraction_used, fraction_size)
-        # E.g. (0, 0.5, 0.5) means GPU 0 has 0.5 used with fraction size of 0.5
-        self._gpu_fractions: list[tuple[int, float, float]] = []
-        self._init_nvml()
-
-    def _init_nvml(self):
-        """Initialize NVML and discover GPUs.
-
-        Only a missing library, an unloaded driver, or an OSError are caught
-        and logged; any other NVML error raised by ``nvmlInit`` propagates.
-        """
-        if not PYNVML_AVAILABLE:
-            logger.warning("PyNVML not available. GPU functionality will be limited.")
-            return
-
-        try:
-            pynvml.nvmlInit()
-            self._initialized = True
-            self._discover_gpus()
-        except (
-            pynvml.NVMLError_LibraryNotFound,
-            pynvml.NVMLError_DriverNotLoaded,
-            OSError,
-        ) as e:
-            logger.warning(f"Failed to initialize NVML: {e}")
-            self._initialized = False
-
-    def __del__(self):
-        """Clean up NVML."""
-        if self._initialized:
-            try:
-                pynvml.nvmlShutdown()
-            except Exception:  # pylint: disable=broad-except
-                # Best effort: errors during finalization are deliberately ignored.
-                pass
-
-    def _discover_gpus(self):
-        """Discover available GPUs and their properties.
-
-        Rebuilds ``self.gpus`` from scratch. Utilization and process lists are
-        optional: if NVML cannot provide them for a device, the defaults from
-        ``GPUInfo`` are kept. An NVML error outside those two queries aborts
-        discovery with a warning, keeping the GPUs appended so far.
-        """
-        if not self._initialized:
-            return
-
-        try:
-            device_count = pynvml.nvmlDeviceGetCount()
-            self.gpus = []
-
-            for i in range(device_count):
-                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-                name = pynvml.nvmlDeviceGetName(handle)
-                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-                uuid = pynvml.nvmlDeviceGetUUID(handle)
-
-                gpu_info = GPUInfo(
-                    index=i, total_memory=memory_info.total, name=name, uuid=uuid
-                )
-
-                try:
-                    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
-                    gpu_info.utilization = utilization.gpu
-                except pynvml.NVMLError:
-                    logger.debug(f"Could not get utilization for GPU {i}")
-
-                # Get processes running on GPU
-                try:
-                    processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
-                    gpu_info.processes = [
-                        GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory)
-                        for p in processes
-                    ]
-                except pynvml.NVMLError:
-                    logger.debug(f"Could not get processes for GPU {i}")
-
-                self.gpus.append(gpu_info)
-
-            logger.info(f"Discovered {len(self.gpus)} GPUs")
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error discovering GPUs: {e}")
-
-    def update_gpu_stats(self):
-        """Update GPU statistics (utilization, memory etc.).
-
-        Refreshes ``total_memory``, ``utilization`` and ``processes`` of every
-        known GPU in place. A failing utilization or process query keeps the
-        previous value; any other NVML error for a GPU is logged and the loop
-        moves on to the next GPU.
-        """
-        if not self._initialized:
-            return
-
-        for gpu in self.gpus:
-            try:
-                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu.index)
-
-                # Update memory info
-                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-                gpu.total_memory = memory_info.total
-
-                # Update utilization
-                try:
-                    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
-                    gpu.utilization = utilization.gpu
-                except pynvml.NVMLError:
-                    pass
-
-                # Update processes
-                try:
-                    processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
-                    gpu.processes = [
-                        GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory)
-                        for p in processes
-                    ]
-                except pynvml.NVMLError:
-                    pass
-
-            except pynvml.NVMLError as e:
-                logger.warning(f"Error updating GPU {gpu.index} stats: {e}")
-
-    def get_gpu_count(self) -> int:
-        """Return the number of discovered GPUs, regardless of availability."""
-        return len(self.gpus)
-
-    def get_available_gpus(self) -> list[int]:
-        """Return a list of available GPU indices."""
-        return [gpu.index for gpu in self.gpus if gpu.available]
-
-    def get_gpu_memory(self, index: int) -> tuple[int, int]:
-        """
-        Return (total memory, free memory) in bytes for a specific GPU.
-
-        Args:
-            index: GPU index
-
-        Returns:
-            Tuple of (total memory, free memory) in bytes; (0, 0) when NVML is
-            not initialized, the index is past the discovered GPUs, or the
-            NVML query fails.
-        """
-        # NOTE(review): only the upper bound is checked; a negative index is
-        # passed straight to NVML.
-        if not self._initialized or index >= len(self.gpus):
-            return (0, 0)
-
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
-            memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-            return (memory_info.total, memory_info.free)
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error getting GPU memory for GPU {index}: {e}")
-            return (0, 0)
-
-    def reset_allocations(self):
-        """Reset all GPU allocations.
-
-        Clears the fractional-allocation list and marks every GPU available.
-        """
-        self._gpu_fractions = []
-        for gpu in self.gpus:
-            gpu.available = True
-
-    def get_gpu_stats(self) -> list[dict[str, t.Any]]:
-        """
-        Get detailed statistics for all GPUs.
-
-        Statistics are refreshed via ``update_gpu_stats`` first, and memory is
-        queried again per GPU through ``get_gpu_memory``.
-
-        Returns:
-            List of dictionaries with GPU statistics
-        """
-        self.update_gpu_stats()
-
-        stats = []
-        for gpu in self.gpus:
-            total_memory, free_memory = self.get_gpu_memory(gpu.index)
-            stats.append(
-                {
-                    "index": gpu.index,
-                    "name": gpu.name,
-                    "uuid": gpu.uuid,
-                    "total_memory": total_memory,
-                    "free_memory": free_memory,
-                    "used_memory": total_memory - free_memory,
-                    # Percentage of memory in use; 0 when the total is unknown
-                    # (get_gpu_memory returned (0, 0)) to avoid dividing by zero.
-                    "memory_utilization": (total_memory - free_memory)
-                    / total_memory
-                    * 100
-                    if total_memory > 0
-                    else 0,
-                    "gpu_utilization": gpu.utilization,
-                    "process_count": len(gpu.processes),
-                    "processes": [
-                        {
-                            "pid": process.pid,
-                            "name": process.name,
-                            "used_memory": process.used_memory,
-                        }
-                        for process in gpu.processes
-                    ],
-                    "available": gpu.available,
-                }
-            )
-
-        return stats
-
-
-def system_resources() -> dict[str, t.Any]:
-    """
-    Report the GPU resources of this machine.
-
-    A fresh ``GPUManager`` is created on every call.
-
-    Returns:
-        Dictionary with the single key 'nvidia.com/gpu', mapped to the list
-        of GPU indices the manager reports as available.
-    """
-    manager = GPUManager()
-    return {NVIDIA_GPU: manager.get_available_gpus()}
--- a/deploy/sdk/src/dynamo/sdk/lib/utils.py
+++ b/deploy/sdk/src/dynamo/sdk/lib/utils.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io
-import os
-import tarfile
-from datetime import datetime
-from typing import Optional
-
-import requests
-
-from dynamo.sdk.core.protocol.deployment import Service
-
-REQUEST_TIMEOUT = 20
-
-
-def get_host_port():
-    """Return ``(host, port)`` from DYNAMO_HOST / DYNAMO_PORT.
-
-    Falls back to host "0.0.0.0" and port 8000 when a variable is unset.
-    """
-    # Parse the port first so a malformed DYNAMO_PORT fails before anything else.
-    port = int(os.getenv("DYNAMO_PORT", 8000))
-    return os.getenv("DYNAMO_HOST", "0.0.0.0"), port
-
-
-def get_system_app_host_port():
-    """Return ``(host, port)`` for the system app from the environment.
-
-    Reads DYNAMO_SYSTEM_APP_HOST / DYNAMO_SYSTEM_APP_PORT. The defaults are
-    host "0.0.0.0" and port 0, i.e. a randomly chosen port.
-    """
-    # Parse the port first so a malformed value fails before anything else.
-    port = int(os.getenv("DYNAMO_SYSTEM_APP_PORT", 0))
-    return os.getenv("DYNAMO_SYSTEM_APP_HOST", "0.0.0.0"), port
-
-
-def upload_graph(
-    endpoint: str,
-    graph: str,
-    entry_service: Service,
-    session: Optional[requests.Session] = None,
-    **kwargs,
-) -> None:
-    """Upload the entire graph as a single component/version, with a manifest of all services.
-
-    Three steps are performed against ``endpoint``:
-      1. create the component ``<name>`` unless a GET shows it already exists;
-      2. create the version ``<version>`` unless a GET returns 200 for it;
-      3. PUT the contents of ``entry_service.path`` as an uncompressed tar
-         archive, built fully in memory.
-
-    Args:
-        endpoint: Base URL of the API server.
-        graph: Graph tag in ``<name>:<version>`` form.
-        entry_service: Supplies the manifest fields and the directory to upload.
-        session: Optional session to reuse; it is left open for the caller.
-            If omitted, a temporary session is created and closed on exit.
-        **kwargs: ``build_at`` (datetime or ISO-8601 string) overrides the
-            build timestamp; a missing or unparsable value falls back to the
-            current UTC time.
-
-    Raises:
-        ValueError: If ``graph`` is not in ``<name>:<version>`` form.
-        RuntimeError: If the server rejects a request.
-        FileNotFoundError: If ``entry_service.path`` is not a directory.
-    """
-    parts = graph.split(":")
-    if len(parts) != 2:
-        raise ValueError(
-            f"`graph` must be in '<name>:<version>' format, got '{graph}'."
-        )
-    graph_name, graph_version = parts
-
-    # Only a session created here is ours to close; a caller-supplied one may
-    # still be in use after this function returns.
-    owns_session = not session
-    session = session or requests.Session()
-    try:
-        # Check if component exists before POST
-        comp_url = f"{endpoint}/api/v1/dynamo_components"
-        comp_get_url = f"{endpoint}/api/v1/dynamo_components/{graph_name}"
-        comp_resp = session.get(comp_get_url, timeout=REQUEST_TIMEOUT)
-        if comp_resp.status_code == 200:
-            comp_exists = True
-        elif comp_resp.status_code == 404:
-            comp_exists = False
-        else:
-            raise RuntimeError(
-                f"Failed to verify component '{graph_name}': "
-                f"{comp_resp.status_code}: {comp_resp.text}"
-            )
-        if not comp_exists:
-            comp_payload = {
-                "name": graph_name,
-                "description": "Registered by Dynamo's KubernetesDeploymentManager",
-            }
-            resp = session.post(comp_url, json=comp_payload, timeout=REQUEST_TIMEOUT)
-            # 409 is accepted: the component was created concurrently.
-            if resp.status_code not in (200, 201, 409):
-                raise RuntimeError(f"Failed to create component: {resp.text}")
-
-        # Check if version exists before POST. Any status other than 200 is
-        # treated as "does not exist" and leads to a creation attempt.
-        ver_url = f"{endpoint}/api/v1/dynamo_components/{graph_name}/versions"
-        ver_get_url = (
-            f"{endpoint}/api/v1/dynamo_components/{graph_name}/versions/{graph_version}"
-        )
-        ver_resp = session.get(ver_get_url, timeout=REQUEST_TIMEOUT)
-        if ver_resp.status_code != 200:
-            build_at = kwargs.get("build_at")
-            if not build_at:
-                build_at = datetime.utcnow()
-            if isinstance(build_at, str):
-                try:
-                    build_at = datetime.fromisoformat(build_at)
-                except ValueError:
-                    # Unparsable timestamp: fall back to "now" rather than fail.
-                    build_at = datetime.utcnow()
-            manifest = {
-                "service": entry_service.service_name,
-                "apis": entry_service.apis,
-                "size_bytes": entry_service.size_bytes,
-            }
-            ver_payload = {
-                "name": entry_service.name,
-                "description": f"Auto-registered version for {graph}",
-                "resource_type": "dynamo_component_version",
-                "version": graph_version,
-                "manifest": manifest,
-                "build_at": build_at.isoformat(),
-            }
-            resp = session.post(ver_url, json=ver_payload, timeout=REQUEST_TIMEOUT)
-            if resp.status_code not in (200, 201, 409):
-                raise RuntimeError(f"Failed to create component version: {resp.text}")
-
-        # Upload the graph
-        build_dir = entry_service.path
-        if not build_dir or not os.path.isdir(build_dir):
-            raise FileNotFoundError(f"Built graph directory not found: {build_dir}")
-        tar_stream = io.BytesIO()
-        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
-            tar.add(build_dir, arcname=".")
-        # Rewind so the PUT sends the archive from its first byte.
-        tar_stream.seek(0)
-        upload_url = f"{endpoint}/api/v1/dynamo_components/{graph_name}/versions/{graph_version}/upload"
-        upload_headers = {"Content-Type": "application/x-tar"}
-        resp = session.put(
-            upload_url,
-            data=tar_stream,
-            headers=upload_headers,
-            timeout=REQUEST_TIMEOUT,
-        )
-        if resp.status_code not in (200, 201, 204):
-            raise RuntimeError(f"Failed to upload graph artifact: {resp.text}")
-    finally:
-        if owns_session:
-            session.close()
-
-
-def get_capi_library_path() -> str:
-    """
-    Locate the libdynamo_llm_capi.so library.
-
-    A non-empty VLLM_KV_CAPI_PATH environment variable is returned verbatim.
-    Otherwise the path is derived from the installed ``dynamo.sdk`` package:
-    ``<dynamo/sdk>/cli/bin/libdynamo_llm_capi.so``.
-
-    Returns:
-        The path to the library. The path is not checked for existence.
-    """
-    override = os.environ.get("VLLM_KV_CAPI_PATH")
-    if not override:
-        # Imported only on this branch, so the environment override works
-        # without importing the package.
-        import dynamo.sdk
-
-        package_dir = os.path.dirname(dynamo.sdk.__file__)
-        return os.path.join(package_dir, "cli", "bin", "libdynamo_llm_capi.so")
-    return override
--- a/deploy/sdk/src/dynamo/sdk/tests/config.yaml
+++ b/deploy/sdk/src/dynamo/sdk/tests/config.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Per-service settings, keyed by service name. The keys presumably refer to
-# the Middle and Backend service classes in tests/pipeline.py - TODO confirm.
-Middle:
-  ServiceArgs:
-    workers: 2
-    resources:
-      cpu: "1"
-
-# Same layout as above, with a different worker count.
-Backend:
-  ServiceArgs:
-    workers: 3
-    resources:
-      cpu: "1"
--- a/deploy/sdk/src/dynamo/sdk/tests/pipeline.py
+++ b/deploy/sdk/src/dynamo/sdk/tests/pipeline.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This is a simple example of a pipeline that uses Dynamo to deploy a backend, middle, and frontend service.
-# Use this to test changes made to CLI, SDK, etc
-
-
-from fastapi.responses import StreamingResponse
-from pydantic import BaseModel
-
-from dynamo.sdk import depends, endpoint, service
-from dynamo.sdk.core.protocol.interface import DynamoTransport
-
-"""
-Pipeline Architecture:
-
-Users/Clients (HTTP)
-      │
-      ▼
-┌─────────────┐
-│  Frontend   │  HTTP API endpoint (/generate)
-└─────────────┘
-      │ dynamo/runtime
-      ▼
-┌─────────────┐
-│   Middle    │
-└─────────────┘
-      │ dynamo/runtime
-      ▼
-┌─────────────┐
-│  Backend    │
-└─────────────┘
-"""
-
-
-# Request payload exchanged between the pipeline services: one text field.
-class RequestType(BaseModel):
-    text: str
-
-
-# Response payload with a single text field.
-# NOTE(review): not referenced by any service visible in this file.
-class ResponseType(BaseModel):
-    text: str
-
-
-GPU_ENABLED = False
-
-
-# Last stage of the test pipeline: emits one message per whitespace-separated
-# token of the request text after appending "-back" to it.
-@service(
-    resources={"cpu": "1"},
-    traffic={"timeout": 30},
-    dynamo={
-        "namespace": "inference",
-    },
-    workers=1,
-)
-class Backend:
-    def __init__(self) -> None:
-        print("Starting backend")
-
-    @endpoint()
-    async def generate(self, req: RequestType):
-        """Generate tokens."""
-        req_text = req.text
-        print(f"Backend received: {req_text}")
-        text = f"{req_text}-back"
-        # str.split() with no argument splits on runs of whitespace.
-        for token in text.split():
-            yield f"Backend: {token}"
-
-    # Same logic as generate(); only the prefix of the yielded messages differs.
-    @endpoint()
-    async def generate_v2(self, req: RequestType):
-        """Generate tokens."""
-        req_text = req.text
-        print(f"Backend received: {req_text}")
-        text = f"{req_text}-back"
-        for token in text.split():
-            yield f"Backend generate_v2: {token}"
-
-
-# Alternative backend stage that declares a dependency on Backend.
-@service(
-    resources={"cpu": "2"},
-    traffic={"timeout": 30},
-    dynamo={"namespace": "inference"},
-)
-class Backend2:
-    backend = depends(Backend)
-
-    def __init__(self) -> None:
-        print("Starting backend2")
-
-    @endpoint()
-    async def generate(self, req: RequestType):
-        """Forward requests to backend."""
-
-        # NOTE(review): despite the docstring, nothing is forwarded or yielded
-        # here; the follow-up request is only serialized and printed, and
-        # self.backend is never called.
-        req_text = req.text
-        print(f"Backend2 received: {req_text}")
-        text = f"{req_text}-back2"
-        next_request = RequestType(text=text).model_dump_json()
-        print(next_request)
-
-
-# Middle stage: appends "-mid" to the request text and relays the responses
-# of the backend stage(s) to its own caller.
-@service(
-    resources={"cpu": "1"},
-    traffic={"timeout": 30},
-    dynamo={"namespace": "inference"},
-)
-class Middle:
-    backend = depends(Backend)
-    backend2 = depends(Backend2)
-
-    def __init__(self) -> None:
-        print("Starting middle")
-
-    @endpoint()
-    async def generate(self, req: RequestType):
-        """Forward requests to backend."""
-        req_text = req.text
-        print(f"Middle received: {req_text}")
-        text = f"{req_text}-mid"
-
-        txt = RequestType(text=text)
-
-        # NOTE(review): the messages below say "Frontend" although this is the
-        # Middle service; callers may depend on the exact text, so it is left
-        # unchanged.
-        if self.backend:
-            # Drain generate() completely, then generate_v2(), using the same request.
-            async for back_resp in self.backend.generate(txt.model_dump_json()):
-                print(f"Frontend received back_resp: {back_resp}")
-                yield f"Frontend: {back_resp}"
-            async for back_resp in self.backend.generate_v2(txt.model_dump_json()):
-                print(f"Frontend received back_resp: {back_resp}")
-                yield f"Frontend: {back_resp}"
-        else:
-            # Used only when the backend dependency is falsy.
-            async for back_resp in self.backend2.generate(txt.model_dump_json()):
-                print(f"Frontend received back_resp: {back_resp}")
-                yield f"Frontend: {back_resp}"
-
-
-# Entry point of the test pipeline: the only service whose endpoint is
-# declared with the HTTP transport.
-@service(
-    resources={"cpu": "1"},
-    traffic={"timeout": 60},
-    dynamo={"namespace": "inference"},
-)
-class Frontend:
-    middle = depends(Middle)
-    # NOTE(review): declared but not used by generate() below.
-    backend = depends(Backend)
-
-    def __init__(self) -> None:
-        print("Starting frontend")
-
-    @endpoint(transports=[DynamoTransport.HTTP])
-    async def generate(self, request: RequestType):
-        """Stream results from the pipeline."""
-        print(f"Frontend received: {request.text}")
-
-        # Relays every Middle response, prefixed with "Frontend: ".
-        async def content_generator():
-            async for response in self.middle.generate(request.model_dump_json()):
-                yield f"Frontend: {response}"
-
-        return StreamingResponse(content_generator())
--- a/deploy/sdk/src/dynamo/sdk/tests/test_config.py
+++ b/deploy/sdk/src/dynamo/sdk/tests/test_config.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-import pytest
-
-from dynamo.sdk.lib.config import ServiceConfig
-
-# Apply the ``pre_merge`` marker to every test in this module.
-pytestmark = pytest.mark.pre_merge
-
-
-def test_service_config_with_common_configs():
-    """Keys listed in ``common-configs`` show up as ``--key`` worker args."""
-    # Function-scope import: only the environment patch below needs it.
-    from unittest import mock
-
-    # Reset singleton instance
-    ServiceConfig._instance = None
-
-    # Config that includes common-configs
-    config_json = """
-    {
-        "Common": {
-            "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-            "block-size": 64,
-            "max-model-len": 16384
-        },
-        "VllmWorker": {
-            "enforce-eager": true,
-            "common-configs": ["model", "block-size", "max-model-len"]
-        }
-    }
-    """
-
-    # patch.dict restores os.environ on exit, so DYNAMO_SERVICE_CONFIG does not
-    # leak into later tests or into subprocesses they spawn.
-    with mock.patch.dict(os.environ, {"DYNAMO_SERVICE_CONFIG": config_json}):
-        # Get arguments and verify common configs are included
-        service_config = ServiceConfig.get_instance()
-        vllm_worker_args = service_config.as_args("VllmWorker")
-
-    # Check that each common config appears in the arguments
-    for key in ["model", "block-size", "max-model-len"]:
-        assert f"--{key}" in vllm_worker_args
-
-
-def test_service_config_without_common_configs():
-    """``Common`` keys are not passed when the service omits ``common-configs``."""
-    # Function-scope import: only the environment patch below needs it.
-    from unittest import mock
-
-    # Reset singleton instance
-    ServiceConfig._instance = None
-
-    # Config that DOESN'T include common-configs
-    config_json = """
-    {
-        "Common": {
-            "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-            "block-size": 64,
-            "max-model-len": 16384
-        },
-        "VllmWorker": {
-            "enforce-eager": true
-        }
-    }
-    """
-
-    # patch.dict restores os.environ on exit, so DYNAMO_SERVICE_CONFIG does not
-    # leak into later tests or into subprocesses they spawn.
-    with mock.patch.dict(os.environ, {"DYNAMO_SERVICE_CONFIG": config_json}):
-        # Get arguments and verify common configs are NOT included
-        service_config = ServiceConfig.get_instance()
-        vllm_worker_args = service_config.as_args("VllmWorker")
-
-    # Check that none of the common configs appear in arguments
-    for key in ["model", "block-size", "max-model-len"]:
-        assert f"--{key}" not in vllm_worker_args
-
-
-def test_service_config_with_direct_configs():
-    """Keys set directly on the service show up as ``--key`` worker args."""
-    # Function-scope import: only the environment patch below needs it.
-    from unittest import mock
-
-    # Reset singleton instance
-    ServiceConfig._instance = None
-
-    # Direct configs (no Common section reference)
-    config_json = """
-    {
-        "VllmWorker": {
-            "enforce-eager": true,
-            "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-            "block-size": 64,
-            "max-model-len": 16384
-        }
-    }
-    """
-
-    # patch.dict restores os.environ on exit, so DYNAMO_SERVICE_CONFIG does not
-    # leak into later tests or into subprocesses they spawn.
-    with mock.patch.dict(os.environ, {"DYNAMO_SERVICE_CONFIG": config_json}):
-        # Get arguments and verify direct configs are included
-        service_config = ServiceConfig.get_instance()
-        vllm_worker_args = service_config.as_args("VllmWorker")
-
-    # Check that each config appears in the arguments
-    for key in ["model", "block-size", "max-model-len"]:
-        assert f"--{key}" in vllm_worker_args
-
-
-def test_service_config_override_common_configs():
-    """A service-level value wins over the ``Common`` value for the same key."""
-    # Function-scope import: only the environment patch below needs it.
-    from unittest import mock
-
-    # Reset singleton instance
-    ServiceConfig._instance = None
-
-    # Config that includes common-configs overridden by the subscribing config
-    config_json = """
-    {
-        "Common": {
-            "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-            "block-size": 64,
-            "max-model-len": 16384
-        },
-        "VllmWorker": {
-            "enforce-eager": true,
-            "block-size": 128,
-            "common-configs": ["model", "block-size", "max-model-len"]
-        }
-    }
-    """
-
-    # patch.dict restores os.environ on exit, so DYNAMO_SERVICE_CONFIG does not
-    # leak into later tests or into subprocesses they spawn.
-    with mock.patch.dict(os.environ, {"DYNAMO_SERVICE_CONFIG": config_json}):
-        # Get arguments and verify common configs are included
-        service_config = ServiceConfig.get_instance()
-        vllm_worker_args = service_config.as_args("VllmWorker")
-
-    # Check that each common config appears in the arguments
-    for key in ["model", "block-size", "max-model-len"]:
-        assert f"--{key}" in vllm_worker_args
-
-    # The value following --block-size must be the service-level override
-    assert vllm_worker_args[vllm_worker_args.index("--block-size") + 1] == "128"
-
-
-def test_explicit_boolean_arguments():
-    """Test that boolean arguments are handled correctly with new logic"""
-    # Function-scope import: only the environment patch below needs it.
-    from unittest import mock
-
-    # Reset singleton instance
-    ServiceConfig._instance = None
-
-    # Boolean configs
-    config_json = """
-    {
-        "VllmWorker": {
-            "enable-prefix-caching": true,
-            "disable-sliding-window": false,
-            "enforce-eager": true
-        }
-    }
-    """
-
-    # patch.dict restores os.environ on exit, so DYNAMO_SERVICE_CONFIG does not
-    # leak into later tests or into subprocesses they spawn.
-    with mock.patch.dict(os.environ, {"DYNAMO_SERVICE_CONFIG": config_json}):
-        # Get arguments and verify boolean handling
-        service_config = ServiceConfig.get_instance()
-        vllm_worker_args = service_config.as_args("VllmWorker")
-
-    # Check that true values are passed as flags only
-    assert "--enable-prefix-caching" in vllm_worker_args
-    # Should NOT have a following "true" value
-    enable_idx = vllm_worker_args.index("--enable-prefix-caching")
-    assert (
-        enable_idx == len(vllm_worker_args) - 1
-        or vllm_worker_args[enable_idx + 1] != "true"
-    )
-
-    # Check that false values for standard boolean flags are omitted
-    assert "--disable-sliding-window" not in vllm_worker_args
-
-    # Check that another true value works as flag
-    assert "--enforce-eager" in vllm_worker_args
-    enforce_idx = vllm_worker_args.index("--enforce-eager")
-    assert (
-        enforce_idx == len(vllm_worker_args) - 1
-        or vllm_worker_args[enforce_idx + 1] != "true"
-    )
-
-
-def test_vllm_boolean_arguments_special_handling():
-    """Test that vLLM boolean arguments with special defaults are handled correctly"""
-    # Function-scope import: only the environment patch below needs it.
-    from unittest import mock
-
-    # Reset singleton instance
-    ServiceConfig._instance = None
-
-    # vLLM boolean configs
-    config_json = """
-    {
-        "VllmWorker": {
-            "enable-prefix-caching": false,
-            "use-tqdm-on-load": false,
-            "multi-step-stream-outputs": false,
-            "some-other-flag": false
-        }
-    }
-    """
-
-    # patch.dict restores os.environ on exit, so DYNAMO_SERVICE_CONFIG does not
-    # leak into later tests or into subprocesses they spawn.
-    with mock.patch.dict(os.environ, {"DYNAMO_SERVICE_CONFIG": config_json}):
-        # Get arguments and verify vLLM special boolean handling
-        service_config = ServiceConfig.get_instance()
-        vllm_worker_args = service_config.as_args("VllmWorker")
-
-    # Check that enable-prefix-caching false uses negative flag
-    assert "--no-enable-prefix-caching" in vllm_worker_args
-    assert "--enable-prefix-caching" not in vllm_worker_args
-
-    # Check that use-tqdm-on-load false uses negative flag
-    assert "--no-use-tqdm-on-load" in vllm_worker_args
-    assert "--use-tqdm-on-load" not in vllm_worker_args
-
-    # Check that multi-step-stream-outputs false uses negative flag
-    assert "--no-multi-step-stream-outputs" in vllm_worker_args
-    assert "--multi-step-stream-outputs" not in vllm_worker_args
-
-    # Check that other false flags are omitted (standard behavior)
-    assert "--some-other-flag" not in vllm_worker_args
--- a/deploy/sdk/src/dynamo/sdk/tests/test_e2e_args.py
+++ b/deploy/sdk/src/dynamo/sdk/tests/test_e2e_args.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-import subprocess
-import time
-
-import pytest
-from typer.testing import CliRunner
-
-from dynamo.sdk.cli.cli import cli
-
-# Shared Typer test runner; the module fixture uses it to invoke ``cli``.
-runner = CliRunner()
-
-
-@pytest.fixture(scope="module", autouse=True)
-def setup_and_teardown():
-    """Start nats-server, etcd and ``dynamo serve`` once for this module.
-
-    Yields the result of a ``--dry-run`` invocation of the same serve command
-    made through the in-process CLI runner. Every subprocess that was started
-    is terminated and waited on afterwards, including when a later setup step
-    raises (for example because a binary is missing).
-    """
-    nats_server = etcd = server = None
-    try:
-        # Setup code
-        nats_server = subprocess.Popen(["nats-server", "-js"])
-        etcd = subprocess.Popen(["etcd"])
-        print("Setting up resources")
-
-        # Run the serve command in dry-run mode with CLI runner to check it's working
-        result = runner.invoke(
-            cli,
-            [
-                "serve",
-                "pipeline:Frontend",
-                "--working-dir",
-                "deploy/sdk/src/dynamo/sdk/tests",
-                "--Frontend.model=qwentastic",
-                "--Middle.bias=0.5",
-                "--dry-run",
-            ],
-        )
-
-        # Now start the actual server using subprocess for the real integration test
-        server = subprocess.Popen(
-            [
-                "dynamo",
-                "serve",
-                "pipeline:Frontend",
-                "--working-dir",
-                "deploy/sdk/src/dynamo/sdk/tests",
-                "--Frontend.model=qwentastic",
-                "--Middle.bias=0.5",
-            ]
-        )
-
-        # Fixed pause before tests run; presumably lets the server come up.
-        time.sleep(5)
-
-        yield result
-    finally:
-        # Teardown code: server first, then nats-server, then etcd
-        print("Tearing down resources")
-        for proc in (server, nats_server, etcd):
-            if proc is not None:
-                proc.terminate()
-                proc.wait()
-
-
-async def test_pipeline(setup_and_teardown):
-    """Check the dry-run output, then POST to the running pipeline with retries."""
-    # Check the CLI command ran successfully
-    dry_run = setup_and_teardown
-    assert dry_run.exit_code == 0
-
-    # Strip ANSI color sequences before looking for the expected content
-    clean_output = re.sub(r"\x1b\[[0-9;]*m", "", dry_run.output)
-    for expected in (
-        "Service Configuration:",
-        '"Frontend": {',
-        '"model": "qwentastic"',
-    ):
-        assert expected in clean_output
-
-    import asyncio
-
-    import aiohttp
-
-    url = "http://localhost:8000/generate"
-    payload = {"text": "federer-is-the-greatest-tennis-player-of-all-time"}
-    headers = {"accept": "text/event-stream"}
-    wanted = "federer-is-the-greatest-tennis-player-of-all-time-mid-back"
-
-    max_retries = 5
-    for attempt in range(1, max_retries + 1):
-        try:
-            async with aiohttp.ClientSession() as session, session.post(
-                url, json=payload, headers=headers
-            ) as resp:
-                assert resp.status == 200
-                text = await resp.text()
-                assert wanted in text
-                break
-        except Exception as e:
-            # Failed assertions are retried too; the last attempt re-raises.
-            if attempt == max_retries:
-                raise
-            print(f"Attempt {attempt} failed, retrying... {e}")
-            await asyncio.sleep(3)
--- a/deploy/sdk/src/dynamo/sdk/tests/test_e2e_config.py
+++ b/deploy/sdk/src/dynamo/sdk/tests/test_e2e_config.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-import subprocess
-import time
-from collections import Counter
-
-import pytest
-from typer.testing import CliRunner
-
-from dynamo.sdk.cli.cli import cli
-
-# Shared Typer test runner; the module fixture uses it to invoke ``cli``.
-runner = CliRunner()
-
-
-@pytest.fixture(scope="module", autouse=True)
-def setup_and_teardown():
-    """Start nats-server, etcd and ``dynamo serve -f config.yaml`` for this module.
-
-    Yields the result of a ``--dry-run`` invocation of the same serve command
-    made through the in-process CLI runner. Every subprocess that was started
-    is terminated and waited on afterwards, including when a later setup step
-    raises (for example because a binary is missing).
-    """
-    nats_server = etcd = server = None
-    try:
-        # Setup code
-        nats_server = subprocess.Popen(["nats-server", "-js"])
-        etcd = subprocess.Popen(["etcd"])
-        print("Setting up resources")
-
-        # Run the serve command in dry-run mode with CLI runner to check it's working
-        result = runner.invoke(
-            cli,
-            [
-                "serve",
-                "pipeline:Frontend",
-                "--working-dir",
-                "deploy/sdk/src/dynamo/sdk/tests",
-                "-f",
-                "deploy/sdk/src/dynamo/sdk/tests/config.yaml",
-                "--dry-run",
-            ],
-        )
-
-        # Now start the actual server using subprocess for the real integration test
-        server = subprocess.Popen(
-            [
-                "dynamo",
-                "serve",
-                "pipeline:Frontend",
-                "--working-dir",
-                "deploy/sdk/src/dynamo/sdk/tests",
-                "-f",
-                "deploy/sdk/src/dynamo/sdk/tests/config.yaml",
-            ]
-        )
-
-        # Fixed pause before tests run; presumably lets the server come up.
-        time.sleep(3)
-
-        yield result
-    finally:
-        # Teardown code: server first, then nats-server, then etcd
-        print("Tearing down resources")
-        for proc in (server, nats_server, etcd):
-            if proc is not None:
-                proc.terminate()
-                proc.wait()
-
-
-async def test_pipeline(setup_and_teardown):
-    """Check worker process counts per service, then POST to the pipeline.
-
-    Expects one ``Frontend``, three ``Backend`` and two ``Middle`` child
-    processes, identified by the value following ``--service-name`` on their
-    command lines, and retries the HTTP request up to five times.
-    """
-    import asyncio
-
-    import aiohttp
-
-    # Check the CLI command ran successfully
-    result = setup_and_teardown
-    assert result.exit_code == 0
-
-    import psutil
-
-    # Capture list of subprocesses (children of current process)
-    current_process = psutil.Process()
-    child_processes = list(current_process.children(recursive=True))
-    # Tally workers by the value following their --service-name argument
-    service_count: Counter[str] = Counter()
-    for proc in child_processes:
-        try:
-            cmd = proc.cmdline()
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
-            # The process exited or is not inspectable; ignore it.
-            continue
-        # Match the flag as an exact argv element: a substring test on the
-        # joined command line would also accept forms such as
-        # "--service-name=X", for which cmd.index() raises ValueError.
-        if "--service-name" in cmd:
-            idx = cmd.index("--service-name")
-            if idx + 1 < len(cmd):
-                service_count[cmd[idx + 1]] += 1
-
-    assert service_count["Frontend"] == 1
-    assert service_count["Backend"] == 3
-    assert service_count["Middle"] == 2
-
-    # Clean the output to check for expected content
-    clean_output = re.sub(r"\x1b\[[0-9;]*m", "", result.output)
-    assert "Service Configuration:" in clean_output
-    max_retries = 5
-    for attempt in range(max_retries):
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.post(
-                    "http://localhost:8000/generate",
-                    json={"text": "federer-is-the-greatest-tennis-player-of-all-time"},
-                    headers={"accept": "text/event-stream"},
-                ) as resp:
-                    assert resp.status == 200
-                    text = await resp.text()
-                    assert (
-                        "federer-is-the-greatest-tennis-player-of-all-time-mid-back"
-                        in text
-                    )
-                    break
-        except Exception as e:
-            # Failed assertions are retried too; the last attempt re-raises.
-            if attempt == max_retries - 1:
-                raise
-            print(f"Attempt {attempt + 1} failed, retrying... {e}")
-            await asyncio.sleep(3)
--- a/deploy/sdk/src/dynamo/sdk/tests/test_link.py
+++ b/deploy/sdk/src/dynamo/sdk/tests/test_link.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from dynamo.sdk.core.protocol.interface import LinkedServices
-
-
-def test_remove_backend2():
-    """Linking Frontend -> Middle -> Backend prunes the unlinked dependencies.
-
-    After ``Frontend.link(Middle).link(Backend)`` and
-    ``LinkedServices.remove_unused_edges()``, ``Frontend`` keeps only its
-    ``middle`` dependency and ``Middle`` keeps only ``backend`` (its
-    ``backend2`` edge is gone).
-    """
-    # NOTE(review): imported inside the test rather than at module level;
-    # presumably to defer loading the pipeline services - confirm.
-    from dynamo.sdk.tests.pipeline import Backend, Backend2, Frontend, Middle
-
-    # Initial state assertions
-    assert set(Frontend.dependencies.keys()) == {"backend", "middle"}
-    assert Frontend.dependencies["backend"].on == Backend
-    assert Frontend.dependencies["middle"].on == Middle
-
-    assert set(Middle.dependencies.keys()) == {"backend", "backend2"}
-    assert Middle.dependencies["backend"].on == Backend
-    assert Middle.dependencies["backend2"].on == Backend2
-
-    assert Backend.dependencies == {}
-
-    # Declare the chain that is actually used, then drop every other edge
-    Frontend.link(Middle).link(Backend)
-    LinkedServices.remove_unused_edges()
-
-    # Final state assertions after linking and cleanup
-    assert Frontend.dependencies["middle"].on == Middle
-    assert set(Frontend.dependencies.keys()) == {"middle"}
-
-    assert set(Middle.dependencies.keys()) == {"backend"}
-    assert Middle.dependencies["backend"].on == Backend
-
-    assert Backend.dependencies == {}
--- a/deploy/sdk/src/dynamo/sdk/tests/test_resources.py
+++ b/deploy/sdk/src/dynamo/sdk/tests/test_resources.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from dynamo.sdk.cli.utils import configure_target_environment
-from dynamo.sdk.core.protocol.interface import ServiceInterface
-from dynamo.sdk.core.runner import TargetEnum
-
-
-@pytest.fixture(scope="module", autouse=True)
-def setup_and_teardown():
-    """Configure the ``TargetEnum.DYNAMO`` target once for this module.
-
-    Nothing is yielded and no teardown runs after the ``yield``.
-    """
-    configure_target_environment(TargetEnum.DYNAMO)
-    yield
-
-
-def test_gpu_resources(setup_and_teardown):
-    """String resource values given to ``@service`` are readable on the config."""
-
-    from dynamo.sdk import service
-
-    @service(
-        resources={"cpu": "2", "gpu": "1", "memory": "4Gi"},
-        dynamo={"namespace": "test"},
-    )
-    class MyService:
-        def __init__(self) -> None:
-            pass
-
-    svc: ServiceInterface = MyService
-    assert svc.config is not None  # type: ignore
-    declared = svc.config.resources
-    assert (declared.cpu, declared.gpu, declared.memory) == ("2", "1", "4Gi")
-
-
-def test_gpu_resources_coercing_from_integers(setup_and_teardown):
-    """Integer ``cpu``/``gpu`` values given to ``@service`` read back as strings."""
-
-    from dynamo.sdk import service
-
-    @service(
-        resources={"cpu": 3, "gpu": 4, "memory": "4Gi"},
-        dynamo={"namespace": "test"},
-    )
-    class MockService:
-        def __init__(self) -> None:
-            pass
-
-    svc: ServiceInterface = MockService
-    assert svc.config is not None  # type: ignore
-    declared = svc.config.resources
-    assert (declared.cpu, declared.gpu, declared.memory) == ("3", "4", "4Gi")
--- a/deploy/sdk/tests/test_deployment.sh
+++ b/deploy/sdk/tests/test_deployment.sh
-#!/bin/bash
-#!/bin/bash -e
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Abort on any failing command, unset variable, or failing pipeline stage
-set -euo pipefail
-
-# Defaults apply only when the caller has not already set these variables
-export DYNAMO_CLOUD="${DYNAMO_CLOUD:-http://dynamo-cloud}"
-export DYNAMO_IMAGE="${DYNAMO_IMAGE:-dynamo-base:latest}"
-export DEPLOYMENT_NAME="${DEPLOYMENT_NAME:-ci-hw}"
-
-cd /workspace/examples/hello_world
-
-# Step 1: Log in to Dynamo Cloud
-dynamo cloud login "$DYNAMO_CLOUD"
-
-# Step 2: Build a dynamo nim with the framework-less base; the tag is the
-# double-quoted string on the "Successfully built" line of the build output
-DYNAMO_TAG=$(dynamo build hello_world:Frontend | grep "Successfully built" | awk -F"\"" '{ print $2 }')
-
-# Step 3: Deploy!
-echo "$DYNAMO_TAG"
-# TODO: Deploy your service using a DynamoGraphDeployment CR.
--- a/docs/architecture/distributed_runtime.md
+++ b/docs/architecture/distributed_runtime.md
@@ -34,7 +34,7 @@ For example, the deployment configuration `examples/llm/configs/disagg.yaml` hav
 - `Processor`: When a new request arrives, `Processor` applies the chat template and perform the tokenization. Then, it route the request to the `VllmWorker`.
 - `VllmWorker` and `PrefillWorker`: Perform the actual decode and prefill computation.

-Since the four workers are deployed in different processes, each of them have their own `DistributedRuntime`. Within their own `DistributedRuntime`, they all have their own `Namespace`s named `dynamo`. Then, under their own `dynamo` namespace, they have their own `Component`s named `Frontend/Processor/VllmWorker/PrefillWorker`. Lastly, for the `Endpoint`, `Frontend` has no `Endpoints`, `Processor` and `VllmWorker` each has a `generate` endpoint, and `PrefillWorker` has a placeholder `mock` endpoint. Their `DistributedRuntime`s and `Namespace`s are set in the `@service` decorators in `examples/llm/components/<frontend/processor/worker/prefill_worker>.py`. Their `Component`s are set by their name in `/deploy/dynamo/sdk/src/dynamo/sdk/cli/serve_dynamo.py`. Their `Endpoint`s are set by the `@endpoint` decorators in `examples/llm/components/<frontend/processor/worker/prefill_worker>.py`.
+Since the four workers are deployed in different processes, each of them has its own `DistributedRuntime`. Within their own `DistributedRuntime`, they all have their own `Namespace`s named `dynamo`. Then, under their own `dynamo` namespace, they have their own `Component`s named `Frontend/Processor/VllmWorker/PrefillWorker`. Lastly, for the `Endpoint`, `Frontend` has no `Endpoints`, `Processor` and `VllmWorker` each have a `generate` endpoint, and `PrefillWorker` has a placeholder `mock` endpoint.

 ## Initialization


--- a/docs/dynamo_glossary.md
+++ b/docs/dynamo_glossary.md
@@ -24,8 +24,6 @@
 **Dynamo Cloud** - A Kubernetes platform providing managed deployment experience for Dynamo inference graphs.

 ## E
-**@endpoint** - A Python decorator used to define service endpoints within a Dynamo component.
-
 **Endpoint** - A specific network-accessible API within a Dynamo component, such as `generate` or `load_metrics`.

 ## F
@@ -70,8 +68,6 @@
 **RDMA (Remote Direct Memory Access)** - Technology that allows direct memory access between distributed systems, used for efficient KV cache transfers.

 ## S
-**@service** - Python decorator used to define a Dynamo service class.
-
 **SGLang** - Fast LLM inference framework with native embedding support and RadixAttention.

 ## T

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -90,7 +90,6 @@ build-backend = "hatchling.build"

 [tool.hatch.build.targets.wheel]
 packages = [
-    "deploy/sdk/src/dynamo",
    "components/frontend/src/dynamo",
    "components/planner/src/dynamo",
    "components/backends/llama_cpp/src/dynamo",