refactor: move GMS to standalone component (#5616)

92ecd308 · Schwinn Saereesitthipitak · GitHub · 7fe89c74 · 7fe89c74 · 92ecd308
Unverified commit 92ecd308, authored Jan 23, 2026 by Schwinn Saereesitthipitak; committed by GitHub on Jan 24, 2026.
6 changed files
--- a/components/src/dynamo/gpu_memory_service/__init__.py
+++ b/components/src/dynamo/gpu_memory_service/__init__.py
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""GPU Memory Service component for Dynamo.
-
-This module provides the Dynamo component wrapper around the gpu_memory_service package.
-The core functionality is in the gpu_memory_service package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
-"""
-
-# Re-export core functionality from gpu_memory_service package
-from gpu_memory_service import (
-    GMSClientMemoryManager,
-    StaleMemoryLayoutError,
-    get_gms_client_memory_manager,
-    get_or_create_gms_client_memory_manager,
-)
-
-# Re-export extensions (built separately)
-try:
-    from gpu_memory_service.client.torch.extensions import _allocator_ext
-except (ImportError, OSError):
-    _allocator_ext = None
-
-# Re-export module utilities
-from gpu_memory_service.client.torch.module import (
-    materialize_module_from_gms,
-    register_module_tensors,
-)
-
-__all__ = [
-    # Core
-    "GMSClientMemoryManager",
-    "StaleMemoryLayoutError",
-    # GMS client memory manager
-    "get_or_create_gms_client_memory_manager",
-    "get_gms_client_memory_manager",
-    # Tensor utilities
-    "register_module_tensors",
-    "materialize_module_from_gms",
-    # Extensions
-    "_allocator_ext",
-]
--- a/components/src/dynamo/gpu_memory_service/__main__.py
+++ b/components/src/dynamo/gpu_memory_service/__main__.py
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-from dynamo.gpu_memory_service.server import main
+from gpu_memory_service.cli.runner import main

 if __name__ == "__main__":
    main()
--- a/lib/gpu_memory_service/cli/__init__.py
+++ b/lib/gpu_memory_service/cli/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""CLI for GPU Memory Service."""
+
+from gpu_memory_service.cli.args import Config, parse_args
+from gpu_memory_service.cli.runner import main
+
+__all__ = [
+    "Config",
+    "parse_args",
+    "main",
+]
--- a/components/src/dynamo/gpu_memory_service/args.py
+++ b/lib/gpu_memory_service/cli/args.py
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-"""Argument parsing for GPU Memory Service server component."""
+"""Argument parsing for GPU Memory Service server."""

 import argparse
 import logging
@@ -23,7 +23,7 @@ class Config:
 def parse_args() -> Config:
    """Parse command line arguments for GPU Memory Service server."""
    parser = argparse.ArgumentParser(
-        description="GPU Memory Service allocation server for Dynamo."
+        description="GPU Memory Service allocation server."
    )

    # GPU Memory Service specific arguments

--- a/components/src/dynamo/gpu_memory_service/server.py
+++ b/lib/gpu_memory_service/cli/runner.py
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-"""GPU Memory Service allocation server component for Dynamo.
+"""GPU Memory Service allocation server runner.

-This component wraps the GMSRPCServer from gpu_memory_service to manage
-GPU memory allocations with connection-based RW/RO locking.
-
-Workers connect via the socket path, which should be passed to vLLM/SGLang via:
-    --load-format gpu_memory_service
-    --model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
+This module provides the CLI runner for the GPU Memory Service server,
+which manages GPU memory allocations with connection-based RW/RO locking.

 Usage:
-    python -m dynamo.gpu_memory_service --device 0
-    python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
+    python -m gpu_memory_service --device 0
+    python -m gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
 """

 import asyncio
@@ -38,7 +34,7 @@ async def worker() -> None:
    # Configure logging level
    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
-        logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG)
+        logging.getLogger("gpu_memory_service").setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {config.device}")
    logger.info(f"Socket path: {config.socket_path}")
@@ -59,10 +55,7 @@ async def worker() -> None:
    await server.start()

    logger.info("GPU Memory Service Server ready, waiting for connections...")
-    logger.info(
-        f"To connect vLLM workers, use: --load-format gpu_memory_service "
-        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
-    )
+    logger.info(f"Clients can connect via socket: {config.socket_path}")

    # Wait for shutdown signal
    try:

--- a/lib/gpu_memory_service/pyproject.toml
+++ b/lib/gpu_memory_service/pyproject.toml
@@ -34,6 +34,9 @@ classifiers = [
 ]
 keywords = ["llm", "genai", "inference", "nvidia", "gpu", "memory", "dynamo"]

+[project.scripts]
+gpu-memory-service = "gpu_memory_service.cli.runner:main"
+
 [project.optional-dependencies]
 test = [
    "pytest>=8.3.4",