Unverified commit 92ecd308, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

refactor: move GMS to standalone component (#5616)

parent 7fe89c74
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service component for Dynamo.
This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory_service package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""
# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
# Re-export the allocator extension. It is built separately from the rest of
# the package, so the import is treated as optional.
try:
    from gpu_memory_service.client.torch.extensions import _allocator_ext
except (ImportError, OSError):
    # Extension not available (not importable, or the import raised an
    # OS-level error -- presumably a failed shared-library load; confirm).
    # Expose None so callers can test for the extension's presence instead
    # of failing when this package is imported.
    _allocator_ext = None
# Re-export module utilities
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)
# Public names re-exported by this module; everything listed here is imported
# from the gpu_memory_service package above.
__all__ = [
    # Core client class and its error type
    "GMSClientMemoryManager",
    "StaleMemoryLayoutError",
    # Accessors for the GMS client memory manager
    "get_or_create_gms_client_memory_manager",
    "get_gms_client_memory_manager",
    # Module/tensor utilities
    "register_module_tensors",
    "materialize_module_from_gms",
    # Separately built extension (None when it could not be imported)
    "_allocator_ext",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from dynamo.gpu_memory_service.server import main from gpu_memory_service.cli.runner import main
if __name__ == "__main__": if __name__ == "__main__":
main() main()
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""CLI for GPU Memory Service."""
from gpu_memory_service.cli.args import Config, parse_args
from gpu_memory_service.cli.runner import main
# Public names of the CLI package: the argument-parsing pieces imported from
# gpu_memory_service.cli.args and the entry point imported from
# gpu_memory_service.cli.runner.
__all__ = [
    # Argument parsing
    "Config",
    "parse_args",
    # Entry point
    "main",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Argument parsing for GPU Memory Service server component.""" """Argument parsing for GPU Memory Service server."""
import argparse import argparse
import logging import logging
...@@ -23,7 +23,7 @@ class Config: ...@@ -23,7 +23,7 @@ class Config:
def parse_args() -> Config: def parse_args() -> Config:
"""Parse command line arguments for GPU Memory Service server.""" """Parse command line arguments for GPU Memory Service server."""
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="GPU Memory Service allocation server for Dynamo." description="GPU Memory Service allocation server."
) )
# GPU Memory Service specific arguments # GPU Memory Service specific arguments
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service allocation server component for Dynamo. """GPU Memory Service allocation server runner.
This component wraps the GMSRPCServer from gpu_memory_service to manage This module provides the CLI runner for the GPU Memory Service server,
GPU memory allocations with connection-based RW/RO locking. which manages GPU memory allocations with connection-based RW/RO locking.
Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
Usage: Usage:
python -m dynamo.gpu_memory_service --device 0 python -m gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock python -m gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
""" """
import asyncio import asyncio
...@@ -38,7 +34,7 @@ async def worker() -> None: ...@@ -38,7 +34,7 @@ async def worker() -> None:
# Configure logging level # Configure logging level
if config.verbose: if config.verbose:
logging.getLogger().setLevel(logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger("dynamo.gpu_memory_service").setLevel(logging.DEBUG) logging.getLogger("gpu_memory_service").setLevel(logging.DEBUG)
logger.info(f"Starting GPU Memory Service Server for device {config.device}") logger.info(f"Starting GPU Memory Service Server for device {config.device}")
logger.info(f"Socket path: {config.socket_path}") logger.info(f"Socket path: {config.socket_path}")
...@@ -59,10 +55,7 @@ async def worker() -> None: ...@@ -59,10 +55,7 @@ async def worker() -> None:
await server.start() await server.start()
logger.info("GPU Memory Service Server ready, waiting for connections...") logger.info("GPU Memory Service Server ready, waiting for connections...")
logger.info( logger.info(f"Clients can connect via socket: {config.socket_path}")
f"To connect vLLM workers, use: --load-format gpu_memory_service "
f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
)
# Wait for shutdown signal # Wait for shutdown signal
try: try:
......
...@@ -34,6 +34,9 @@ classifiers = [ ...@@ -34,6 +34,9 @@ classifiers = [
] ]
keywords = ["llm", "genai", "inference", "nvidia", "gpu", "memory", "dynamo"] keywords = ["llm", "genai", "inference", "nvidia", "gpu", "memory", "dynamo"]
[project.scripts]
gpu-memory-service = "gpu_memory_service.cli.runner:main"
[project.optional-dependencies] [project.optional-dependencies]
test = [ test = [
"pytest>=8.3.4", "pytest>=8.3.4",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment