"components/vscode:/vscode.git/clone" did not exist on "c90e3dff7e774d5170a83823eb225214bcc9f9ab"
Unverified commit 30c6228b, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

feat: GPU Memory Service (#5286)


Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
parent cde3b2a5
...@@ -111,7 +111,8 @@ $SANITY_STATUS ...@@ -111,7 +111,8 @@ $SANITY_STATUS
Now build the project: Now build the project:
cargo build --locked --profile dev --features dynamo-llm/block-manager cargo build --locked --profile dev --features dynamo-llm/block-manager
cd lib/bindings/python && maturin develop --uv cd lib/bindings/python && maturin develop --uv
DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e . uv pip install -e lib/gpu_memory_service # GPU memory manager with C++ extension
DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .
Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support
......
...@@ -45,6 +45,10 @@ container/Dockerfile* ...@@ -45,6 +45,10 @@ container/Dockerfile*
.venv .venv
.venv-docs .venv-docs
# GPU Memory Service build artifacts
lib/gpu_memory_service/build/
lib/gpu_memory_service/*.egg-info/
lib/gpu_memory_service/**/*.so
# Python # Python
__pycache__/ __pycache__/
......
...@@ -78,6 +78,7 @@ core: ...@@ -78,6 +78,7 @@ core:
- 'components/src/dynamo/mocker/**' - 'components/src/dynamo/mocker/**'
- 'components/src/dynamo/frontend/**' - 'components/src/dynamo/frontend/**'
- 'components/src/dynamo/common/**' - 'components/src/dynamo/common/**'
- 'components/src/dynamo/gpu_memory_service/**'
- '*.toml' - '*.toml'
- '*.lock' - '*.lock'
- '*.py' - '*.py'
......
...@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/ ...@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
tensorrtllm_engines/ tensorrtllm_engines/
api_server_models/ api_server_models/
server/ server/
!lib/gpu_memory_service/server/
# Replay/Snapshot test artifacts # Replay/Snapshot test artifacts
*.new *.new
lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/ lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
......
...@@ -331,7 +331,16 @@ cd lib/bindings/python ...@@ -331,7 +331,16 @@ cd lib/bindings/python
maturin develop --uv maturin develop --uv
``` ```
## 6. Install the Wheel ## 6. Install GPU Memory Service
The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).
```bash
cd $PROJECT_ROOT
uv pip install -e lib/gpu_memory_service
```
## 7. Install the Wheel
``` ```
cd $PROJECT_ROOT cd $PROJECT_ROOT
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service component for Dynamo.
This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory_service package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""
# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
# Re-export extensions (built separately)
# The allocator extension is optional: if importing it fails, expose
# `_allocator_ext` as None so this package can still be imported.
# NOTE(review): OSError is presumably caught for shared-object load
# failures — confirm against the extension's loader.
try:
    from gpu_memory_service.client.torch.extensions import _allocator_ext
except (ImportError, OSError):
    _allocator_ext = None
# Re-export module utilities
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)
__all__ = [
# Core
"GMSClientMemoryManager",
"StaleMemoryLayoutError",
# GMS client memory manager
"get_or_create_gms_client_memory_manager",
"get_gms_client_memory_manager",
# Tensor utilities
"register_module_tensors",
"materialize_module_from_gms",
# Extensions
"_allocator_ext",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.gpu_memory_service.server import main
# Start the server only when this module is executed as a script, not when it
# is merely imported.
if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Argument parsing for GPU Memory Service server component."""
import argparse
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class Config:
    """Resolved command-line configuration for the GPU Memory Service server.

    Instances are produced by ``parse_args``; ``socket_path`` is stored with
    any ``{device}`` placeholder already expanded.
    """

    # CUDA device ID the server manages memory for (``--device``).
    device: int
    # Unix domain socket path, placeholder already substituted.
    socket_path: str
    # True when ``--verbose`` / ``-v`` was passed.
    verbose: bool


def parse_args(argv: "list[str] | None" = None) -> Config:
    """Parse command line arguments for GPU Memory Service server.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behavior; passing a
            list lets callers and tests drive the parser directly.

    Returns:
        A ``Config`` with the device, the resolved socket path and the
        verbosity flag.

    Raises:
        SystemExit: Raised by argparse for invalid arguments, including a
            ``--socket-path`` template that cannot be expanded with only the
            ``{device}`` field.
    """
    parser = argparse.ArgumentParser(
        description="GPU Memory Service allocation server for Dynamo."
    )

    # GPU Memory Service specific arguments
    parser.add_argument(
        "--device",
        type=int,
        required=True,
        help="CUDA device ID to manage memory for.",
    )
    parser.add_argument(
        "--socket-path",
        type=str,
        default=None,
        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
        "Supports {device} placeholder for multi-GPU setups.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging.",
    )

    args = parser.parse_args(argv)

    if args.socket_path is None:
        # No path given: derive the per-device default.
        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
    else:
        # Expand the {device} placeholder. str.format raises on any other
        # field or on stray braces; report that as a normal usage error
        # instead of letting a raw traceback escape.
        try:
            socket_path = args.socket_path.format(device=args.device)
        except (KeyError, IndexError, ValueError, AttributeError, TypeError) as exc:
            parser.error(
                f"invalid --socket-path {args.socket_path!r}: only the {{device}} "
                f"placeholder is supported ({exc!r})"
            )

    return Config(
        device=args.device,
        socket_path=socket_path,
        verbose=args.verbose,
    )
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service allocation server component for Dynamo.
This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.
Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
Usage:
python -m dynamo.gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""
import asyncio
import logging
import signal
import uvloop
from gpu_memory_service.server import GMSRPCServer
from .args import parse_args
# Configure logging when this module is imported: INFO level, with each record
# showing timestamp, logger name, level and message. worker() raises the level
# to DEBUG when --verbose is set.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
async def worker() -> None:
    """Run the GPU Memory Service server until SIGTERM or SIGINT arrives.

    Parses the command line, starts a ``GMSRPCServer`` on the configured
    socket, then waits on an event that the signal handlers set. The server
    is stopped in a ``finally`` clause once that wait ends.
    """
    cfg = parse_args()

    # --verbose raises both the root logger and this package's logger to DEBUG.
    if cfg.verbose:
        for target in (
            logging.getLogger(),
            logging.getLogger("dynamo.gpu_memory_service"),
        ):
            target.setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {cfg.device}")
    logger.info(f"Socket path: {cfg.socket_path}")

    rpc_server = GMSRPCServer(cfg.socket_path, device=cfg.device)

    # The signal handlers only set this event; teardown happens below.
    stop_requested = asyncio.Event()

    def _on_signal():
        logger.info("Received shutdown signal")
        stop_requested.set()

    running_loop = asyncio.get_running_loop()
    for signum in (signal.SIGTERM, signal.SIGINT):
        running_loop.add_signal_handler(signum, _on_signal)

    await rpc_server.start()
    logger.info("GPU Memory Service Server ready, waiting for connections...")
    connect_hint = (
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{cfg.socket_path}"}}\''
    )
    logger.info(connect_hint)

    # Block until a signal arrives; always stop the server on the way out.
    try:
        await stop_requested.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        await rpc_server.stop()
        logger.info("GPU Memory Service Server shutdown complete")
def main() -> None:
    """Call ``uvloop.install()`` and then drive ``worker()`` with ``asyncio.run``."""
    uvloop.install()
    server_coro = worker()
    asyncio.run(server_coro)


if __name__ == "__main__":
    main()
...@@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen ...@@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
############################################## ##############################################
########## Runtime image ############## ########## Runtime image ##############
############################################## ##############################################
...@@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \ ...@@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \
# Install dynamo wheels (runtime packages only, no test dependencies) # Install dynamo wheels (runtime packages only, no test dependencies)
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \ RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \ if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
...@@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi ...@@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--requirement /tmp/requirements.test.txt --requirement /tmp/requirements.test.txt
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \ RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \ if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
......
...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG ...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
################################## ##################################
########## Runtime Image ######### ########## Runtime Image #########
################################## ##################################
...@@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src ...@@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src
ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}" ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
# Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages) # Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=bind,source=.,target=/mnt/local_src \ RUN --mount=type=bind,source=.,target=/mnt/local_src \
pip install --no-cache-dir --break-system-packages \ pip install --no-cache-dir --break-system-packages \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION} sglang==${SGLANG_VERSION} && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; \
fi
# Install common and test dependencies as root # Install common and test dependencies as root
RUN --mount=type=bind,source=.,target=/mnt/local_src \ RUN --mount=type=bind,source=.,target=/mnt/local_src \
......
...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG ...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -454,6 +455,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -454,6 +455,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
################################################## ##################################################
########## Framework Builder Stage ############## ########## Framework Builder Stage ##############
################################################## ##################################################
...@@ -770,12 +778,21 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/ ...@@ -770,12 +778,21 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
# Install dynamo, NIXL, and dynamo-specific dependencies # Install dynamo, NIXL, and dynamo-specific dependencies
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path> # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
RUN uv pip install \ RUN uv pip install \
--no-cache \ --no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install --no-cache "$GMS_WHEEL"; \
fi && \
if [ "${ENABLE_KVBM}" = "true" ]; then \ if [ "${ENABLE_KVBM}" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
......
...@@ -41,6 +41,7 @@ ARG BASE_IMAGE_TAG ...@@ -41,6 +41,7 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -481,6 +482,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -481,6 +482,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
######################################################## ########################################################
########## Framework Development Image ################ ########## Framework Development Image ################
######################################################## ########################################################
...@@ -605,6 +613,7 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin ...@@ -605,6 +613,7 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin
COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/ COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/ COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
COPY --from=dynamo_base /usr/local/cuda/lib64/stubs/ /usr/local/cuda/lib64/stubs/
RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\ RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\
ln -s /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\ ln -s /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\
ln -s /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so ln -s /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so
...@@ -744,11 +753,20 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/ ...@@ -744,11 +753,20 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
# Install dynamo, NIXL, and dynamo-specific dependencies # Install dynamo, NIXL, and dynamo-specific dependencies
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path> # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
RUN uv pip install \ RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "${ENABLE_KVBM}" = "true" ]; then \ if [ "${ENABLE_KVBM}" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
...@@ -823,6 +841,7 @@ RUN cd /usr/local/lib && \ ...@@ -823,6 +841,7 @@ RUN cd /usr/local/lib && \
ldconfig ldconfig
USER dynamo USER dynamo
ARG DYNAMO_COMMIT_SHA ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
......
...@@ -156,6 +156,10 @@ PUSH="" ...@@ -156,6 +156,10 @@ PUSH=""
# or can be explicitly enabled via --enable-kvbm flag # or can be explicitly enabled via --enable-kvbm flag
ENABLE_KVBM=false ENABLE_KVBM=false
# GPU Memory Service - default disabled, enabled automatically for VLLM/SGLANG
# or can be explicitly enabled via --enable-gpu-memory-service flag
ENABLE_GPU_MEMORY_SERVICE=false
# sccache configuration for S3 # sccache configuration for S3
USE_SCCACHE="" USE_SCCACHE=""
SCCACHE_BUCKET="" SCCACHE_BUCKET=""
...@@ -343,6 +347,9 @@ get_options() { ...@@ -343,6 +347,9 @@ get_options() {
--enable-kvbm) --enable-kvbm)
ENABLE_KVBM=true ENABLE_KVBM=true
;; ;;
--enable-gpu-memory-service)
ENABLE_GPU_MEMORY_SERVICE=true
;;
--enable-media-nixl) --enable-media-nixl)
ENABLE_MEDIA_NIXL=true ENABLE_MEDIA_NIXL=true
;; ;;
...@@ -539,6 +546,7 @@ show_help() { ...@@ -539,6 +546,7 @@ show_help() {
echo " [--release-build perform a release build]" echo " [--release-build perform a release build]"
echo " [--make-efa Adds AWS EFA layer on top of the built image (works with any target)]" echo " [--make-efa Adds AWS EFA layer on top of the built image (works with any target)]"
echo " [--enable-kvbm Enables KVBM support in Python 3.12]" echo " [--enable-kvbm Enables KVBM support in Python 3.12]"
echo " [--enable-gpu-memory-service Enables GPU Memory Service support]"
echo " [--enable-media-nixl Enable media processing with NIXL support (default: true for frameworks, false for none)]" echo " [--enable-media-nixl Enable media processing with NIXL support (default: true for frameworks, false for none)]"
echo " [--enable-media-ffmpeg Enable media processing with FFMPEG support (default: true for frameworks, false for none)]" echo " [--enable-media-ffmpeg Enable media processing with FFMPEG support (default: true for frameworks, false for none)]"
echo " [--use-sccache enable sccache for Rust/C/C++ compilation caching]" echo " [--use-sccache enable sccache for Rust/C/C++ compilation caching]"
...@@ -831,6 +839,20 @@ if [[ ${ENABLE_KVBM} == "true" ]]; then ...@@ -831,6 +839,20 @@ if [[ ${ENABLE_KVBM} == "true" ]]; then
BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} " BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} "
fi fi
# ENABLE_GPU_MEMORY_SERVICE: Used in Dockerfiles for gpu_memory_service wheel.
# Declared but not currently used in Dockerfile.trtllm.
# Force GPU Memory Service to be enabled for VLLM and SGLANG frameworks
if [[ $FRAMEWORK == "VLLM" ]] || [[ $FRAMEWORK == "SGLANG" ]]; then
echo "Forcing enable_gpu_memory_service to true in ${FRAMEWORK} image build"
ENABLE_GPU_MEMORY_SERVICE=true
fi
# For other frameworks, ENABLE_GPU_MEMORY_SERVICE defaults to false unless --enable-gpu-memory-service flag was provided
if [[ ${ENABLE_GPU_MEMORY_SERVICE} == "true" ]]; then
echo "Enabling GPU Memory Service in the dynamo image"
BUILD_ARGS+=" --build-arg ENABLE_GPU_MEMORY_SERVICE=${ENABLE_GPU_MEMORY_SERVICE} "
fi
# ENABLE_MEDIA_NIXL: Enable media processing with NIXL support # ENABLE_MEDIA_NIXL: Enable media processing with NIXL support
# Used in base Dockerfile for maturin build feature flag. # Used in base Dockerfile for maturin build feature flag.
# Can be explicitly overridden with --enable-media-nixl flag # Can be explicitly overridden with --enable-media-nixl flag
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service - out-of-process GPU memory manager.
The GPU Memory Service decouples ownership of GPU memory from the processes
that use it, enabling zero-copy sharing and data survival across process crashes.
Package structure:
- common/: Shared types and protocol (used by both server and client)
- server/: Allocation server daemon (no CUDA context required)
- client/: Client library for memory management
- client/torch/: PyTorch integration (allocator, tensor, module, extensions)
Primary client API:
from gpu_memory_service import (
GMSClientMemoryManager,
get_or_create_gms_client_memory_manager,
get_gms_client_memory_manager,
)
Server API:
from gpu_memory_service.server import GMSRPCServer
"""
# Primary client exports
from gpu_memory_service.client.memory_manager import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
)
# PyTorch integration (GMS client memory manager)
from gpu_memory_service.client.torch.allocator import (
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
__all__ = [
# Client
"GMSClientMemoryManager",
"StaleMemoryLayoutError",
# GMS client memory manager
"get_or_create_gms_client_memory_manager",
"get_gms_client_memory_manager",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service client library.
This module provides the client-side components for interacting with the
GPU Memory Service:
- GMSClientMemoryManager: Manages local VA mappings of remote GPU memory
- GMSRPCClient: Low-level RPC client (pure Python, no PyTorch dependency)
For PyTorch integration (MemPool, tensor utilities), see gpu_memory_service.client.torch.
"""
from gpu_memory_service.client.memory_manager import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
)
from gpu_memory_service.client.rpc import GMSRPCClient
__all__ = [
"GMSClientMemoryManager",
"StaleMemoryLayoutError",
"GMSRPCClient",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Client-side CUDA VMM utilities.
These functions wrap CUDA driver API calls used by the client memory manager
for importing, mapping, and unmapping GPU memory.
"""
from __future__ import annotations
from cuda.bindings import driver as cuda
from gpu_memory_service.common.cuda_vmm_utils import check_cuda_result
from gpu_memory_service.common.types import GrantedLockType
def import_handle_from_fd(fd: int) -> int:
    """Turn a shareable POSIX file descriptor into a CUDA memory handle.

    Args:
        fd: POSIX file descriptor received via SCM_RIGHTS.

    Returns:
        The imported CUDA memory handle, converted to ``int``.

    The driver status is passed to ``check_cuda_result`` before returning.
    """
    fd_handle_type = (
        cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    )
    status, imported = cuda.cuMemImportFromShareableHandle(fd, fd_handle_type)
    check_cuda_result(status, "cuMemImportFromShareableHandle")
    return int(imported)
def reserve_va(size: int, granularity: int) -> int:
    """Reserve a range of virtual address space via ``cuMemAddressReserve``.

    Args:
        size: Number of bytes to reserve (should be aligned to granularity).
        granularity: VMM allocation granularity, passed as the second
            argument of the driver call.

    Returns:
        The reserved virtual address, converted to ``int``.
    """
    status, reserved = cuda.cuMemAddressReserve(size, granularity, 0, 0)
    check_cuda_result(status, "cuMemAddressReserve")
    return int(reserved)
def free_va(va: int, size: int) -> None:
    """Release a virtual address reservation via ``cuMemAddressFree``.

    Args:
        va: Start of the reservation to free.
        size: Size of the reservation in bytes.
    """
    [status] = cuda.cuMemAddressFree(va, size)
    check_cuda_result(status, "cuMemAddressFree")
def map_to_va(va: int, size: int, handle: int) -> None:
    """Map a CUDA memory handle at a virtual address via ``cuMemMap``.

    Args:
        va: Target virtual address (must be reserved).
        size: Number of bytes to map.
        handle: CUDA memory handle to map.
    """
    # The third and fifth arguments to cuMemMap are both passed as 0.
    [status] = cuda.cuMemMap(va, size, 0, handle, 0)
    check_cuda_result(status, "cuMemMap")
def set_access(va: int, size: int, device: int, access: GrantedLockType) -> None:
    """Apply access permissions to a mapped region via ``cuMemSetAccess``.

    Args:
        va: Start of the mapped region.
        size: Size of the region in bytes.
        device: CUDA device index the permission applies to.
        access: ``GrantedLockType.RO`` selects read-only protection; any
            other value selects read-write.
    """
    if access == GrantedLockType.RO:
        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READ
    else:
        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE

    # One access descriptor targeting the given device.
    descriptor = cuda.CUmemAccessDesc()
    descriptor.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    descriptor.location.id = device
    descriptor.flags = protection

    [status] = cuda.cuMemSetAccess(va, size, [descriptor], 1)
    check_cuda_result(status, "cuMemSetAccess")
def unmap(va: int, size: int) -> None:
    """Remove the mapping of a virtual address region via ``cuMemUnmap``.

    Args:
        va: Start of the region to unmap.
        size: Size of the mapping in bytes.
    """
    [status] = cuda.cuMemUnmap(va, size)
    check_cuda_result(status, "cuMemUnmap")
def release_handle(handle: int) -> None:
    """Release a CUDA memory handle via ``cuMemRelease``.

    Args:
        handle: CUDA memory handle to release.
    """
    [status] = cuda.cuMemRelease(handle)
    check_cuda_result(status, "cuMemRelease")
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service client-side memory manager.
This is the unified memory manager for the GPU Memory Service architecture.
Key properties:
- Uses GMSRPCClient over a Unix-domain socket.
- The socket connection itself is the RW/RO lock.
- In write mode, the manager can allocate + map RW and then publish via commit().
- In read mode, the manager can import + map RO and hold the RO lock during inference.
- sleep()/wake() releases and reacquires the RO lock (and remaps allocations).
This module uses cuda-python bindings for CUDA driver API calls:
- import FDs (cuMemImportFromShareableHandle)
- reserve VA (cuMemAddressReserve)
- map/unmap (cuMemMap/cuMemUnmap)
- enforce access (cuMemSetAccess)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Dict, List, Optional
import torch
from cuda.bindings import driver as cuda
from gpu_memory_service.client.cuda_vmm_utils import (
free_va,
import_handle_from_fd,
map_to_va,
release_handle,
reserve_va,
set_access,
unmap,
)
from gpu_memory_service.client.rpc import GMSRPCClient
from gpu_memory_service.common.cuda_vmm_utils import (
align_to_granularity,
get_allocation_granularity,
)
from gpu_memory_service.common.types import GrantedLockType, RequestedLockType
logger = logging.getLogger(__name__)
class StaleMemoryLayoutError(Exception):
    """Signals that the memory layout changed while this reader was asleep.

    A writer took the RW lock and altered the allocation structure (different
    sizes, different tensor layouts) during the sleep window. The caller is
    expected to re-import the model from scratch.

    IMPORTANT: this is a LAYOUT check, NOT a CONTENT check.
    - Detected: allocation sizes changed, tensors added/removed, metadata
      structure changed.
    - NOT detected: weight values modified in-place.

    That is deliberate: sleep/wake supports use cases such as RL training,
    where another process writes to the same memory locations (e.g. updating
    weights) without changing the structure. As long as the layout (allocation
    and metadata table hashes) stays identical, wake() succeeds.
    """
@dataclass(frozen=True)
class LocalMapping:
    """Frozen record describing one allocation's local VA mapping.

    Instances are never mutated; the ``with_*`` helpers build a modified copy.
    """

    allocation_id: str
    va: int
    size: int
    aligned_size: int
    handle: int  # 0 if unmapped but VA reserved
    tag: str
    access: GrantedLockType

    def with_handle(self, handle: int) -> "LocalMapping":
        """Return a copy of this record that carries ``handle`` instead."""
        return LocalMapping(
            allocation_id=self.allocation_id,
            va=self.va,
            size=self.size,
            aligned_size=self.aligned_size,
            handle=handle,
            tag=self.tag,
            access=self.access,
        )

    def with_access(self, access: GrantedLockType) -> "LocalMapping":
        """Return a copy of this record that carries ``access`` instead."""
        return LocalMapping(
            allocation_id=self.allocation_id,
            va=self.va,
            size=self.size,
            aligned_size=self.aligned_size,
            handle=self.handle,
            tag=self.tag,
            access=access,
        )
class GMSClientMemoryManager:
    """Unified memory manager that can act as writer or reader.

    Modes:
    - mode=RequestedLockType.RW: acquire RW lock, allocate/map RW, mutate metadata, commit/publish.
    - mode=RequestedLockType.RO: acquire RO lock (READY only), import/map RO, sleep/wake.
    - mode=RequestedLockType.RW_OR_RO: try RW if available, else wait for RO.

    The manager is also a context manager: leaving the ``with`` block calls
    close().
    """

    def __init__(
        self,
        socket_path: str,
        *,
        mode: RequestedLockType,
        device: int = 0,
        timeout_ms: Optional[int] = None,
    ) -> None:
        """Connect to the GPU Memory Service and acquire the requested lock.

        Args:
            socket_path: Path of the server's Unix-domain socket.
            mode: Lock type to request (RW, RO, or RW_OR_RO).
            device: CUDA device index used for all mappings.
            timeout_ms: Default lock-acquisition timeout, forwarded to the RPC
                client; also used by switch_to_read()/wake() when they are
                called without an explicit timeout.
        """
        self.socket_path = socket_path
        self.device = device
        self._timeout_ms = timeout_ms

        self._client: Optional[GMSRPCClient] = None
        self._mappings: Dict[int, LocalMapping] = {}  # va -> mapping
        self._allocation_id_to_va: Dict[str, int] = {}

        self._sleeping = False
        self._closed = False
        self._preserved_allocation_ids: List[str] = []
        self._published = False
        self._mode: Optional[GrantedLockType] = None  # Updated by _connect

        # VA-stable sleep/wake state
        self._va_preserved = False
        self._last_memory_layout_hash: str = (
            ""  # Hash from server, saved on connect/commit
        )

        # Ensure torch is on the right device for subsequent CUDA operations.
        if torch.cuda.is_available():
            torch.cuda.set_device(self.device)

        # Cache granularity for VA alignment
        self.granularity = get_allocation_granularity(device)

        self._connect(lock_type=mode, timeout_ms=timeout_ms)

    def _connect(
        self,
        *,
        lock_type: RequestedLockType,
        timeout_ms: Optional[int],
        update_memory_layout_hash: bool = True,
    ) -> None:
        """Open a new RPC connection and record the granted lock type.

        Constructing the RPC client is what acquires the lock. Note that this
        method clears the sleeping flag; wake() restores it if waking fails.

        Args:
            lock_type: Lock type to request from the server.
            timeout_ms: Lock-acquisition timeout forwarded to the RPC client.
            update_memory_layout_hash: When True and the server reports a
                committed state, store the server's layout hash for later
                stale-layout detection. wake() passes False so the hash saved
                before sleeping is kept for comparison.
        """
        self._client = GMSRPCClient(
            self.socket_path, lock_type=lock_type, timeout_ms=timeout_ms
        )
        self._sleeping = False
        # Update mode based on granted lock type (may differ from requested for rw_or_ro)
        self._mode = self._client.lock_type
        # Save state hash for stale detection on wake (skip during wake itself)
        if update_memory_layout_hash and self._client.committed:
            self._last_memory_layout_hash = self._client.get_memory_layout_hash()

    @property
    def mode(self) -> Optional[GrantedLockType]:
        """Current mode of the memory manager."""
        return self._mode

    @property
    def lock_type(self) -> Optional[GrantedLockType]:
        """Get the lock type actually granted by the server (None if no client)."""
        if self._client is None:
            return None
        return self._client.lock_type

    @property
    def is_connected(self) -> bool:
        """True while an RPC client exists and reports itself connected."""
        return self._client is not None and self._client.is_connected

    @property
    def is_sleeping(self) -> bool:
        """True between sleep() and a successful wake()."""
        return self._sleeping

    @property
    def mappings(self) -> Dict[int, LocalMapping]:
        """VA -> LocalMapping dictionary.

        This is the manager's live internal dict, not a copy; callers should
        treat it as read-only.
        """
        return self._mappings

    @property
    def total_bytes(self) -> int:
        """Total bytes allocated across all mappings."""
        return sum(m.aligned_size for m in self._mappings.values())

    # ==================== Metadata convenience ====================

    def metadata_put(
        self, key: str, allocation_id: str, offset_bytes: int, value: bytes
    ) -> bool:
        """Forward a metadata put to the connected RPC client."""
        return self._client_rpc.metadata_put(key, allocation_id, offset_bytes, value)

    def metadata_get(self, key: str) -> Optional[tuple[str, int, bytes]]:
        """Forward a metadata lookup to the connected RPC client."""
        return self._client_rpc.metadata_get(key)

    def metadata_list(self, prefix: str = "") -> List[str]:
        """Forward a metadata key listing to the connected RPC client."""
        return self._client_rpc.metadata_list(prefix)

    def metadata_delete(self, key: str) -> bool:
        """Forward a metadata delete to the connected RPC client."""
        return self._client_rpc.metadata_delete(key)

    # ==================== Allocation operations ====================

    def list_allocations(self, tag: Optional[str] = None) -> List[Dict]:
        """List all allocations on the server."""
        return self._client_rpc.list_allocations(tag)

    def allocate_and_map(self, size: int, tag: str = "default") -> int:
        """Allocate on server, reserve VA, and map locally.

        If any step fails, everything done so far is undone on a best-effort
        basis (local mapping, imported handle, VA reservation and the
        server-side allocation) before the original exception is re-raised.

        Args:
            size: Requested allocation size in bytes.
            tag: Allocation tag for server tracking.

        Returns:
            Virtual address of the mapped allocation.

        Raises:
            RuntimeError: If not in RW mode, not connected, or the server's
                aligned size differs from the locally computed one.
        """
        self._require_rw()
        client = self._client_rpc

        aligned_size = align_to_granularity(size, self.granularity)
        va = reserve_va(aligned_size, self.granularity)
        allocation_id: Optional[str] = None
        try:
            allocation_id, server_aligned = client.allocate(aligned_size, tag)
            if int(server_aligned) != aligned_size:
                raise RuntimeError(
                    f"Alignment mismatch: {aligned_size} vs {server_aligned}"
                )
            handle = self._export_and_map(
                client, allocation_id, va, aligned_size, GrantedLockType.RW
            )
        except Exception:
            # _export_and_map leaves the VA unmapped on failure, so it is safe
            # to give the reservation back here.
            self._best_effort(free_va, va, aligned_size)
            if allocation_id is not None:
                # Nothing references the fresh server allocation; drop it so a
                # failed call does not leave an orphan behind.
                self._best_effort(client.free, allocation_id)
            raise

        self._track_mapping(
            LocalMapping(
                allocation_id=allocation_id,
                va=va,
                size=size,
                aligned_size=aligned_size,
                handle=handle,
                tag=tag,
                access=GrantedLockType.RW,
            )
        )
        return va

    def free_mapping(self, va: int) -> None:
        """Unmap and free a local mapping.

        Unknown VAs are ignored. While holding the RW lock and before a
        successful commit, the server-side allocation is freed as well
        (best effort).
        """
        mapping = self._mappings.pop(va, None)
        if mapping is None:
            return
        self._allocation_id_to_va.pop(mapping.allocation_id, None)
        try:
            if mapping.handle != 0:
                unmap(va, mapping.aligned_size)
                release_handle(mapping.handle)
            free_va(va, mapping.aligned_size)
        except Exception as e:
            logger.warning(f"Error freeing VA 0x{va:x}: {e}")

        if self.lock_type == GrantedLockType.RW and not self._published:
            try:
                self._client_rpc.free(mapping.allocation_id)
            except Exception as e:
                # Deliberately best-effort, but leave a trace for debugging.
                logger.debug(
                    f"Server-side free of {mapping.allocation_id} failed: {e}"
                )

    def import_allocation(self, allocation_id: str) -> int:
        """Import an existing allocation and map locally.

        In RO mode, maps read-only. In RW mode, maps read-write. An allocation
        that is already tracked returns its existing VA. On failure the
        imported handle, any partial mapping and the VA reservation are
        released before the exception is re-raised.

        Returns:
            Virtual address of the mapped allocation.
        """
        known_va = self._allocation_id_to_va.get(allocation_id)
        if known_va is not None:
            return known_va

        client = self._client_rpc
        # lock_type is guaranteed non-None when connected (after _client_rpc succeeds)
        assert self.lock_type is not None
        current_access = self.lock_type

        alloc_info = client.get_allocation(allocation_id)
        aligned_size = int(alloc_info.aligned_size)
        size = int(alloc_info.size)
        tag = str(getattr(alloc_info, "tag", "default"))

        va = reserve_va(aligned_size, self.granularity)
        try:
            handle = self._export_and_map(
                client, allocation_id, va, aligned_size, current_access
            )
        except Exception:
            self._best_effort(free_va, va, aligned_size)
            raise

        self._track_mapping(
            LocalMapping(
                allocation_id=allocation_id,
                va=va,
                size=size,
                aligned_size=aligned_size,
                handle=handle,
                tag=tag,
                access=current_access,
            )
        )
        return va

    def clear_all(self) -> int:
        """Clear all allocations on the server (RW only). Local mappings are unmapped first."""
        self._require_rw()
        self._unmap_all()
        return self._client_rpc.clear_all()

    # ==================== Publish / mode switching ====================

    def commit(self) -> bool:
        """Publish weights (RW only).

        Client responsibilities:
        - cudaDeviceSynchronize() before publishing
        - flip local mappings to RO before publishing

        Server responsibilities:
        - transition to COMMITTED
        - close the RW socket (publish + release)

        Returns:
            True if the commit succeeded; the RPC client is then dropped
            because the commit releases the RW connection.
        """
        self._require_rw()

        if torch.cuda.is_available():
            torch.cuda.synchronize(self.device)

        # After publishing, prevent further writes locally.
        for va, m in list(self._mappings.items()):
            if m.access != GrantedLockType.RO:
                set_access(m.va, m.aligned_size, self.device, GrantedLockType.RO)
                self._mappings[va] = m.with_access(GrantedLockType.RO)

        ok = bool(self._client_rpc.commit())
        self._published = ok
        # _client.commit() closes the socket on success; reflect that here.
        if ok:
            self._client = None
        return ok

    def switch_to_read(self, timeout_ms: Optional[int] = None) -> None:
        """Acquire an RO lock after publishing.

        This is intended for the common flow where a writer loads weights and then
        becomes a reader for inference. It is a no-op when an RO connection is
        already held.

        Args:
            timeout_ms: Timeout for RO lock acquisition; falls back to the
                constructor's timeout when None.

        Raises:
            RuntimeError: If closed, sleeping, or still holding a non-RO
                connection.
        """
        if self._closed:
            raise RuntimeError("Memory manager is closed")
        if self._sleeping:
            raise RuntimeError(
                "Cannot switch_to_read() while sleeping; call wake() first"
            )
        if self._client is not None:
            if self.lock_type == GrantedLockType.RO:
                return
            raise RuntimeError(
                "switch_to_read() requires the RW connection to be released (call commit() first)"
            )

        eff_timeout = timeout_ms if timeout_ms is not None else self._timeout_ms
        self._connect(lock_type=RequestedLockType.RO, timeout_ms=eff_timeout)

    # ==================== Sleep / wake (read mode) ====================

    def sleep(self) -> None:
        """Release RO lock and unmap local allocations (VA-stable).

        VAs are preserved during sleep so tensor pointers remain stable.
        On wake, allocations are remapped to the same VAs. Calling sleep()
        while already sleeping is a no-op.

        Raises:
            RuntimeError: If closed or not holding an RO lock.
        """
        if self._closed:
            raise RuntimeError("Memory manager is closed")
        if self._sleeping:
            return
        if self.lock_type != GrantedLockType.RO:
            raise RuntimeError("sleep() requires RO mode")

        if torch.cuda.is_available():
            torch.cuda.synchronize(self.device)

        # Preserve allocation IDs for remapping on wake
        self._preserved_allocation_ids = list(self._allocation_id_to_va.keys())

        # Unmap physical memory but keep VA reservations
        self._unmap_preserving_va()
        self._va_preserved = True

        # Closing the connection is what releases the RO lock.
        self._client_rpc.close()
        self._client = None
        self._sleeping = True

    def wake(self, timeout_ms: Optional[int] = None) -> bool:
        """Reacquire RO lock and remap preserved allocations (VA-stable).

        Allocations are remapped to the same VAs they had before sleep,
        ensuring tensor pointers remain valid.

        If waking fails after the lock was acquired, the manager is rolled
        back to the sleeping state: allocations remapped so far are unmapped
        again (VA reservations kept) and the RO connection is closed. The
        manager therefore never reports itself awake with unmapped VAs, and
        either a retry or close() is safe afterwards.

        Args:
            timeout_ms: Timeout for RO lock acquisition.

        Returns:
            True on success (also when the manager was not sleeping).

        Raises:
            TimeoutError: If timeout_ms expires waiting for RO lock.
            StaleMemoryLayoutError: If weights were structurally changed while sleeping.
            RuntimeError: If closed, or if some allocations could not be remapped.
        """
        if self._closed:
            raise RuntimeError("Memory manager is closed")
        if not self._sleeping:
            return True

        if torch.cuda.is_available():
            torch.cuda.set_device(self.device)

        eff_timeout = timeout_ms if timeout_ms is not None else self._timeout_ms
        # NOTE: _connect() clears self._sleeping; _abort_wake() restores it if
        # anything below fails.
        self._connect(
            lock_type=RequestedLockType.RO,
            timeout_ms=eff_timeout,
            update_memory_layout_hash=False,
        )

        try:
            # Check if memory layout changed while sleeping
            current_hash = self._client_rpc.get_memory_layout_hash()
            if (
                self._last_memory_layout_hash
                and current_hash != self._last_memory_layout_hash
            ):
                raise StaleMemoryLayoutError(
                    f"State changed while sleeping: hash {self._last_memory_layout_hash[:16]}... -> {current_hash[:16]}..."
                )
            remapped_count, total_bytes = self._remap_all_preserved()
        except Exception:
            self._abort_wake()
            raise

        logger.info(
            f"[GPU Memory Service] Wake complete on device {self.device}: "
            f"remapped {remapped_count} allocations ({total_bytes / (1 << 30):.2f} GiB)"
        )
        self._sleeping = False
        self._va_preserved = False
        return True

    # ==================== Cleanup ====================

    def close(self) -> None:
        """Release all mappings, VA reservations and the server connection.

        Idempotent. A failure while synchronizing the device no longer
        prevents cleanup; the error propagates after resources are released.
        """
        if self._closed:
            return

        try:
            # Ensure kernels are done before tearing down mappings.
            if torch.cuda.is_available():
                torch.cuda.synchronize(self.device)
        finally:
            # Release all mappings including preserved VA reservations
            self._unmap_all()

            client, self._client = self._client, None
            if client is not None:
                client.close()
            self._closed = True
            self._sleeping = False
            self._va_preserved = False
            self._preserved_allocation_ids.clear()

    def __enter__(self) -> "GMSClientMemoryManager":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()

    # ==================== Internals ====================

    @property
    def _client_rpc(self) -> GMSRPCClient:
        """Get connected client or raise. Use instead of _require_connected() + assert."""
        if self._client is None:
            if self._sleeping:
                raise RuntimeError("Memory manager is sleeping")
            raise RuntimeError("Memory manager is not connected")
        return self._client

    def _require_rw(self) -> None:
        """Raise if not in RW mode."""
        if self.lock_type != GrantedLockType.RW:
            raise RuntimeError("Operation requires RW mode")

    def _track_mapping(self, m: LocalMapping) -> None:
        """Record a mapping in both lookup tables (by VA and by allocation id)."""
        self._mappings[m.va] = m
        self._allocation_id_to_va[m.allocation_id] = m.va

    @staticmethod
    def _best_effort(action, *args) -> None:
        """Run a cleanup callable, logging (never raising) any failure."""
        try:
            action(*args)
        except Exception as e:
            logger.warning(
                f"Cleanup step {getattr(action, '__name__', action)} failed: {e}"
            )

    def _export_and_map(
        self,
        client: GMSRPCClient,
        allocation_id: str,
        va: int,
        aligned_size: int,
        access: GrantedLockType,
    ) -> int:
        """Export an allocation, import its handle and map it at ``va``.

        On failure, any mapping made here is removed and the imported handle
        is released, so ``va`` is left reserved but unmapped and the caller
        decides whether to free or keep the reservation.

        Returns:
            The imported CUDA memory handle, now mapped at ``va``.
        """
        fd = client.export(allocation_id)
        # NOTE(review): fd ownership is assumed to pass to import_handle_from_fd,
        # as in the original code; confirm it closes the descriptor.
        handle = import_handle_from_fd(fd)
        mapped = False
        try:
            map_to_va(va, aligned_size, handle)
            mapped = True
            set_access(va, aligned_size, self.device, access)
        except Exception:
            if mapped:
                self._best_effort(unmap, va, aligned_size)
            self._best_effort(release_handle, handle)
            raise
        return handle

    def _remap_all_preserved(self) -> tuple[int, int]:
        """Remap every allocation preserved by sleep().

        Returns:
            (number of allocations remapped, total aligned bytes remapped).

        Raises:
            StaleMemoryLayoutError: Propagated immediately from a remap.
            RuntimeError: After trying all allocations, if any remap failed
                for another reason.
        """
        remapped_count = 0
        failed_count = 0
        total_bytes = 0
        for alloc_id in self._preserved_allocation_ids:
            try:
                va = self._remap_preserved_va(alloc_id)
                mapping = self._mappings.get(va)
                if mapping:
                    total_bytes += mapping.aligned_size
                remapped_count += 1
            except StaleMemoryLayoutError:
                raise  # Let StaleMemoryLayoutError propagate
            except Exception as e:
                logger.warning(f"Failed to remap {alloc_id}: {e}")
                failed_count += 1

        if failed_count > 0:
            raise RuntimeError(
                f"Wake failed: {failed_count} of {len(self._preserved_allocation_ids)} "
                f"allocations could not be remapped"
            )
        return remapped_count, total_bytes

    def _abort_wake(self) -> None:
        """Roll a failed wake() back to the sleeping state.

        Unmaps whatever was remapped (keeping VA reservations), closes the
        RO connection and re-marks the manager as sleeping.
        """
        self._unmap_preserving_va()
        client, self._client = self._client, None
        if client is not None:
            self._best_effort(client.close)
        self._sleeping = True
        self._va_preserved = True

    def _unmap_preserving_va(self) -> None:
        """Unmap physical memory but PRESERVE VA reservations for sleep/wake.

        This keeps the VA reservation intact so tensors maintain stable pointers.
        On wake, we can remap to the same VAs. Per-mapping failures are logged
        and do not stop the loop.
        """
        unmapped_count = 0
        total_bytes = 0

        for va, mapping in list(self._mappings.items()):
            if mapping.handle == 0:
                continue  # Already unmapped
            try:
                unmap(va, mapping.aligned_size)
                release_handle(mapping.handle)
                self._mappings[va] = mapping.with_handle(
                    0
                )  # Mark unmapped, VA reserved
                unmapped_count += 1
                total_bytes += mapping.aligned_size
            except Exception as e:
                logger.warning(
                    f"Error unmapping VA 0x{va:x} (preserving reservation): {e}"
                )

        logger.info(
            f"[GPU Memory Service] Unmapped {unmapped_count} allocations ({total_bytes / (1 << 30):.2f} GiB), "
            f"preserving {len(self._mappings)} VA reservations"
        )

    def _remap_preserved_va(self, allocation_id: str) -> int:
        """Remap an allocation to its preserved VA.

        Requires the VA to already be reserved (from before sleep).
        Validates allocation still exists and size matches. If mapping fails,
        the imported handle is released and the VA stays reserved but unmapped.

        Returns the VA.

        Raises StaleMemoryLayoutError if allocation is missing or size changed.
        """
        if torch.cuda.is_available():
            torch.cuda.set_device(self.device)

        va = self._allocation_id_to_va.get(allocation_id)
        if va is None:
            raise RuntimeError(f"No preserved VA for allocation {allocation_id}")

        mapping = self._mappings.get(va)
        if mapping is None:
            raise RuntimeError(f"No mapping info for VA 0x{va:x}")

        if mapping.handle != 0:
            return va  # Already mapped

        client = self._client_rpc
        # lock_type is guaranteed non-None when connected (after _client_rpc succeeds)
        assert self.lock_type is not None
        current_access = self.lock_type

        # Validate allocation still exists and size matches
        try:
            alloc_info = client.get_allocation(allocation_id)
        except Exception as e:
            raise StaleMemoryLayoutError(
                f"Allocation {allocation_id} no longer exists on server: {e}"
            ) from e

        server_aligned_size = int(alloc_info.aligned_size)
        if server_aligned_size != mapping.aligned_size:
            raise StaleMemoryLayoutError(
                f"Allocation {allocation_id} size changed: expected {mapping.aligned_size}, got {server_aligned_size}"
            )

        # Re-import the handle and map to the SAME VA (which is still reserved),
        # with access permissions based on the current lock type.
        handle = self._export_and_map(
            client, allocation_id, va, mapping.aligned_size, current_access
        )

        # Record the new handle right away so later cleanup can find it even
        # if the diagnostics below misbehave.
        self._mappings[va] = mapping.with_handle(handle).with_access(current_access)

        # Synchronize to ensure mapping is complete before any access
        sync_result = cuda.cuCtxSynchronize()[0]
        if sync_result != cuda.CUresult.CUDA_SUCCESS:
            logger.warning(
                f"[GPU Memory Service] cuCtxSynchronize failed after remapping VA 0x{va:x}: "
                f"error {sync_result}"
            )

        # Validate the pointer is accessible (this is what Triton checks)
        result, _dev_ptr = cuda.cuPointerGetAttribute(
            cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER, va
        )
        if result != cuda.CUresult.CUDA_SUCCESS:
            err_result, err_str = cuda.cuGetErrorString(result)
            err_msg = ""
            if err_result == cuda.CUresult.CUDA_SUCCESS and err_str:
                err_msg = (
                    err_str.decode() if isinstance(err_str, bytes) else str(err_str)
                )
            logger.warning(
                f"[GPU Memory Service] cuPointerGetAttribute failed for VA 0x{va:x} after remap: "
                f"error {result} ({err_msg})"
            )
        else:
            logger.debug(
                f"[GPU Memory Service] Remapped VA 0x{va:x} validated OK (device={self.device})"
            )

        return va

    def _unmap_all(self) -> None:
        """Unmap and release all local mappings including VA reservations."""
        for va, mapping in list(self._mappings.items()):
            try:
                if mapping.handle != 0:
                    unmap(va, mapping.aligned_size)
                    release_handle(mapping.handle)
                free_va(va, mapping.aligned_size)
            except Exception as e:
                logger.warning(f"Error unmapping VA 0x{va:x}: {e}")
        self._mappings.clear()
        self._allocation_id_to_va.clear()
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service RPC Client.
Low-level RPC client stub. The client provides a simple interface for acquiring
locks and performing allocation operations. The socket connection IS the lock.
This module has NO PyTorch dependency.
Usage:
# Writer (acquires RW lock in constructor)
with GMSRPCClient(socket_path, lock_type=RequestedLockType.RW) as client:
alloc_id, aligned_size = client.allocate(size=1024*1024)
fd = client.export(alloc_id)
# ... write weights using fd ...
client.commit()
# Lock released on exit
# Reader (acquires RO lock in constructor)
client = GMSRPCClient(socket_path, lock_type=RequestedLockType.RO)
if client.committed: # Check if weights are valid
allocations = client.list_allocations()
for alloc in allocations:
fd = client.export(alloc["allocation_id"])
# ... import and map fd ...
# Keep connection open during inference!
# client.close() only when done with inference
"""
import logging
import socket
from typing import Dict, List, Optional, Tuple, Type, TypeVar
from gpu_memory_service.common.protocol.messages import (
AllocateRequest,
AllocateResponse,
ClearAllRequest,
ClearAllResponse,
CommitRequest,
CommitResponse,
ErrorResponse,
ExportRequest,
FreeRequest,
FreeResponse,
GetAllocationRequest,
GetAllocationResponse,
GetAllocationStateRequest,
GetAllocationStateResponse,
GetLockStateRequest,
GetLockStateResponse,
GetStateHashRequest,
GetStateHashResponse,
HandshakeRequest,
HandshakeResponse,
ListAllocationsRequest,
ListAllocationsResponse,
MetadataDeleteRequest,
MetadataDeleteResponse,
MetadataGetRequest,
MetadataGetResponse,
MetadataListRequest,
MetadataListResponse,
MetadataPutRequest,
MetadataPutResponse,
)
from gpu_memory_service.common.protocol.wire import recv_message_sync, send_message_sync
from gpu_memory_service.common.types import (
RW_REQUIRED,
GrantedLockType,
RequestedLockType,
)
T = TypeVar("T")
logger = logging.getLogger(__name__)
class GMSRPCClient:
    """GPU Memory Service RPC Client.

    CRITICAL: Socket connection IS the lock.
    - Constructor blocks until lock is acquired
    - close() releases the lock
    - committed property tells readers if weights are valid

    For writers (lock_type=RequestedLockType.RW):
    - Use context manager (with statement) for automatic lock release
    - Call commit() after weights are written
    - Call clear_all() before loading new model

    For readers (lock_type=RequestedLockType.RO):
    - Check committed property after construction
    - Keep connection open during inference lifetime
    - Only call close() when shutting down or allowing weight updates
    """

    def __init__(
        self,
        socket_path: str,
        lock_type: RequestedLockType = RequestedLockType.RO,
        timeout_ms: Optional[int] = None,
    ):
        """Connect to Allocation Server and acquire lock.

        Args:
            socket_path: Path to server's Unix domain socket
            lock_type: Requested lock type (RW, RO, or RW_OR_RO)
            timeout_ms: Timeout in milliseconds for lock acquisition.
                None means wait indefinitely.

        Raises:
            ConnectionError: If connection fails
            TimeoutError: If timeout_ms expires waiting for lock
        """
        self.socket_path = socket_path
        self._requested_lock_type = lock_type
        self._socket: Optional[socket.socket] = None
        self._recv_buffer = bytearray()
        self._committed = False
        self._granted_lock_type: Optional[GrantedLockType] = None

        # Connect and acquire lock
        self._connect(timeout_ms=timeout_ms)

    def _connect(self, timeout_ms: Optional[int]) -> None:
        """Connect to server and perform handshake (lock acquisition).

        The socket is only stored on the instance once the handshake has
        succeeded; on any failure it is closed before the error propagates,
        so a failed client never appears connected and leaks no descriptor.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            response = self._handshake(sock, timeout_ms)
        except BaseException:
            sock.close()
            self._socket = None
            raise
        self._socket = sock

        self._committed = response.committed
        # Store granted lock type (may differ from requested for rw_or_ro mode)
        if response.granted_lock_type is not None:
            self._granted_lock_type = response.granted_lock_type
        elif self._requested_lock_type == RequestedLockType.RW:
            self._granted_lock_type = GrantedLockType.RW
        else:
            self._granted_lock_type = GrantedLockType.RO

        logger.info(
            f"Connected with {self._requested_lock_type.value} lock (granted={self._granted_lock_type.value}), "
            f"committed={self._committed}"
        )

    def _handshake(
        self, sock: socket.socket, timeout_ms: Optional[int]
    ) -> HandshakeResponse:
        """Connect ``sock`` and exchange the handshake; return the success response.

        Raises:
            ConnectionError: Connect failed, the server sent an error, or the
                response had an unexpected type.
            TimeoutError: The server reported that the lock was not granted.
        """
        try:
            sock.connect(self.socket_path)
        except FileNotFoundError:
            raise ConnectionError(f"Server not running at {self.socket_path}") from None
        except Exception as e:
            raise ConnectionError(f"Failed to connect: {e}") from e

        # Send handshake (this IS lock acquisition)
        request = HandshakeRequest(
            lock_type=self._requested_lock_type, timeout_ms=timeout_ms
        )
        send_message_sync(sock, request)

        # Receive response (may block waiting for lock)
        response, _, self._recv_buffer = recv_message_sync(sock, self._recv_buffer)

        if isinstance(response, ErrorResponse):
            raise ConnectionError(f"Handshake error: {response.error}")
        if not isinstance(response, HandshakeResponse):
            raise ConnectionError(f"Unexpected response: {type(response)}")
        if not response.success:
            raise TimeoutError("Timeout waiting for lock")
        return response

    @property
    def committed(self) -> bool:
        """Check if weights are committed (valid)."""
        return self._committed

    @property
    def lock_type(self) -> Optional[GrantedLockType]:
        """Get the lock type actually granted by the server.

        For rw_or_ro mode, this tells you whether RW or RO was granted.
        """
        return self._granted_lock_type

    @property
    def is_connected(self) -> bool:
        """Check if client is connected."""
        return self._socket is not None

    def _send_recv(self, request) -> Tuple[object, int]:
        """Send request and receive response. Returns (response, fd).

        Raises:
            RuntimeError: If not connected or the server returned an error.
        """
        if not self._socket:
            raise RuntimeError("Client not connected")

        send_message_sync(self._socket, request)
        response, fd, self._recv_buffer = recv_message_sync(
            self._socket, self._recv_buffer
        )

        if isinstance(response, ErrorResponse):
            raise RuntimeError(f"Server error: {response.error}")

        return response, fd

    def _call(self, request, response_type: Type[T]) -> T:
        """Send request, validate response type, return typed response.

        Requests whose type is listed in RW_REQUIRED are rejected locally
        unless the granted lock is RW.
        """
        if type(request) in RW_REQUIRED and self.lock_type != GrantedLockType.RW:
            raise RuntimeError("Operation requires RW connection")
        response, _ = self._send_recv(request)
        if not isinstance(response, response_type):
            raise RuntimeError(f"Unexpected response: {type(response)}")
        return response

    def get_lock_state(self) -> GetLockStateResponse:
        """Query the server's lock state."""
        return self._call(GetLockStateRequest(), GetLockStateResponse)

    def get_allocation_state(self) -> GetAllocationStateResponse:
        """Query the server's allocation state."""
        return self._call(GetAllocationStateRequest(), GetAllocationStateResponse)

    def is_ready(self) -> bool:
        """Alias for the ``committed`` property."""
        return self.committed

    def commit(self) -> bool:
        """Commit weights and release RW lock. Returns True on success.

        The server closes the RW socket as part of commit, so a socket error
        while waiting for the reply is not treated as failure outright: the
        outcome is verified with a short-lived RO connection instead.
        """
        if CommitRequest in RW_REQUIRED and self.lock_type != GrantedLockType.RW:
            raise RuntimeError("Operation requires RW connection")

        try:
            response, _ = self._send_recv(CommitRequest())
            ok = isinstance(response, CommitResponse) and response.success
        except OSError as e:
            # ConnectionResetError and BrokenPipeError are OSError subclasses.
            # Server closes RW socket as part of commit
            logger.debug(
                f"Commit saw socket error ({type(e).__name__}); verifying via RO connect"
            )
            self.close()
            try:
                with GMSRPCClient(
                    self.socket_path, lock_type=RequestedLockType.RO, timeout_ms=1000
                ) as ro:
                    ok = ro.committed
            except TimeoutError:
                ok = False

        if ok:
            self._committed = True
            self.close()
            logger.info("Committed weights and released RW connection")
            return True
        return False

    def allocate(self, size: int, tag: str = "default") -> Tuple[str, int]:
        """Returns (allocation_id, aligned_size)."""
        r = self._call(AllocateRequest(size=size, tag=tag), AllocateResponse)
        return r.allocation_id, r.aligned_size

    def export(self, allocation_id: str) -> int:
        """Export allocation as POSIX FD. Caller must close.

        Raises:
            RuntimeError: If the server's reply carried no file descriptor.
        """
        _, fd = self._send_recv(ExportRequest(allocation_id=allocation_id))
        if fd < 0:
            raise RuntimeError("No FD received from server")
        return fd

    def get_allocation(self, allocation_id: str) -> GetAllocationResponse:
        """Fetch the server's record for one allocation."""
        return self._call(
            GetAllocationRequest(allocation_id=allocation_id), GetAllocationResponse
        )

    def list_allocations(self, tag: Optional[str] = None) -> List[Dict]:
        """List allocations, passing ``tag`` through to the server."""
        return self._call(
            ListAllocationsRequest(tag=tag), ListAllocationsResponse
        ).allocations

    def free(self, allocation_id: str) -> bool:
        """Free one allocation on the server; returns the server's success flag."""
        return self._call(
            FreeRequest(allocation_id=allocation_id), FreeResponse
        ).success

    def clear_all(self) -> int:
        """Clear all allocations; returns the server-reported cleared count."""
        return self._call(ClearAllRequest(), ClearAllResponse).cleared_count

    def metadata_put(
        self, key: str, allocation_id: str, offset_bytes: int, value: bytes
    ) -> bool:
        """Store a metadata entry; returns the server's success flag."""
        req = MetadataPutRequest(
            key=key, allocation_id=allocation_id, offset_bytes=offset_bytes, value=value
        )
        return self._call(req, MetadataPutResponse).success

    def metadata_get(self, key: str) -> Optional[tuple[str, int, bytes]]:
        """Returns (allocation_id, offset_bytes, value) or None if not found."""
        r = self._call(MetadataGetRequest(key=key), MetadataGetResponse)
        return (r.allocation_id, r.offset_bytes, r.value) if r.found else None

    def metadata_delete(self, key: str) -> bool:
        """Delete a metadata entry; returns the server's ``deleted`` flag."""
        return self._call(
            MetadataDeleteRequest(key=key), MetadataDeleteResponse
        ).deleted

    def metadata_list(self, prefix: str = "") -> List[str]:
        """List metadata keys, passing ``prefix`` through to the server."""
        return self._call(MetadataListRequest(prefix=prefix), MetadataListResponse).keys

    def get_memory_layout_hash(self) -> str:
        """Get state hash (hash of allocations + metadata). Empty if not committed."""
        return self._call(
            GetStateHashRequest(), GetStateHashResponse
        ).memory_layout_hash

    def close(self) -> None:
        """Close connection and release lock. Safe to call more than once."""
        sock, self._socket = self._socket, None
        if sock is None:
            return
        try:
            sock.close()
        except Exception as e:
            # Closing is best-effort; the connection is considered gone either way.
            logger.debug(f"Ignoring error while closing socket: {e}")
        lock_str = self.lock_type.value if self.lock_type else "unknown"
        logger.info(f"Closed {lock_str} connection")

    def __enter__(self) -> "GMSRPCClient":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit."""
        self.close()

    def __del__(self):
        """Destructor: warn if connection not closed."""
        # getattr keeps this safe for instances whose __init__ never ran.
        if getattr(self, "_socket", None):
            logger.warning("GMSRPCClient not closed properly")
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""PyTorch integration for GPU Memory Service.
This module provides PyTorch-specific functionality:
- Memory manager singleton management
- Tensor utilities (metadata, registration, materialization)
- C++ extension for CUDAPluggableAllocator
"""
from gpu_memory_service.client.torch.allocator import (
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)
# Public API of this package: the names re-exported from the allocator and
# module submodules imported above.
__all__ = [
    # GMS client memory manager
    "get_or_create_gms_client_memory_manager",
    "get_gms_client_memory_manager",
    # Tensor operations (public API)
    "register_module_tensors",
    "materialize_module_from_gms",
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment