Unverified commit 92ecd308, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor: move GMS to standalone component (#5616)

parent 7fe89c74
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service component for Dynamo.
This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory_service package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""
# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
# Re-export extensions (built separately)
try:
from gpu_memory_service.client.torch.extensions import _allocator_ext
except (ImportError, OSError):
_allocator_ext = None
# Re-export module utilities
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)
# Public API of this package: every name listed here is re-exported from the
# gpu_memory_service package by the imports above. Note that `_allocator_ext`
# is bound to None when importing the extension raises ImportError or OSError.
__all__ = [
# Core
"GMSClientMemoryManager",
"StaleMemoryLayoutError",
# GMS client memory manager
"get_or_create_gms_client_memory_manager",
"get_gms_client_memory_manager",
# Tensor utilities
"register_module_tensors",
"materialize_module_from_gms",
# Extensions
"_allocator_ext",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.gpu_memory_service.server import main
from gpu_memory_service.cli.runner import main
if __name__ == "__main__":
main()
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""CLI for GPU Memory Service."""
from gpu_memory_service.cli.args import Config, parse_args
from gpu_memory_service.cli.runner import main
# Public names of the CLI package: `Config` and `parse_args` are imported from
# gpu_memory_service.cli.args, and `main` from gpu_memory_service.cli.runner.
__all__ = [
"Config",
"parse_args",
"main",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Argument parsing for GPU Memory Service server component."""
"""Argument parsing for GPU Memory Service server."""
import argparse
import logging
......@@ -23,7 +23,7 @@ class Config:
def parse_args() -> Config:
"""Parse command line arguments for GPU Memory Service server."""
parser = argparse.ArgumentParser(
description="GPU Memory Service allocation server for Dynamo."
description="GPU Memory Service allocation server."
)
# GPU Memory Service specific arguments
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service allocation server component for Dynamo.
"""GPU Memory Service allocation server runner.
This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.
Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
This module provides the CLI runner for the GPU Memory Service server,
which manages GPU memory allocations with connection-based RW/RO locking.
Usage:
python -m dynamo.gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
python -m gpu_memory_service --device 0
python -m gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""
import asyncio
......@@ -38,7 +34,7 @@ async def worker() -> None:
# Configure logging level
if config.verbose:
logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG)
logging.getLogger("gpu_memory_service").setLevel(logging.DEBUG)
logger.info(f"Starting GPU Memory Service Server for device {config.device}")
logger.info(f"Socket path: {config.socket_path}")
......@@ -59,10 +55,7 @@ async def worker() -> None:
await server.start()
logger.info("GPU Memory Service Server ready, waiting for connections...")
logger.info(
f"To connect vLLM workers, use: --load-format gpu_memory_service "
f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
)
logger.info(f"Clients can connect via socket: {config.socket_path}")
# Wait for shutdown signal
try:
......
......@@ -34,6 +34,9 @@ classifiers = [
]
keywords = ["llm", "genai", "inference", "nvidia", "gpu", "memory", "dynamo"]
[project.scripts]
gpu-memory-service = "gpu_memory_service.cli.runner:main"
[project.optional-dependencies]
test = [
"pytest>=8.3.4",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment