feat: GPU Memory Service (#5286)

Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>

feat: GPU Memory Service (#5286)
Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
30c6228b · Schwinn Saereesitthipitak · GitHub · cde3b2a5 · 30c6228b · 30c6228b
Unverified Commit 30c6228b authored Jan 23, 2026 by Schwinn Saereesitthipitak Committed by GitHub Jan 23, 2026
20 changed files
--- a/.devcontainer/post-create.sh
+++ b/.devcontainer/post-create.sh
@@ -111,7 +111,8 @@ $SANITY_STATUS
 Now build the project:
  cargo build --locked --profile dev --features dynamo-llm/block-manager
  cd lib/bindings/python && maturin develop --uv
-  DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e .
+  uv pip install -e lib/gpu_memory_service  # GPU memory manager with C++ extension
+  DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .
 Optional: cd lib/bindings/kvbm && maturin develop --uv  # For KVBM support

--- a/.dockerignore
+++ b/.dockerignore
@@ -45,6 +45,10 @@ container/Dockerfile*
 .venv
 .venv-docs
+# GPU Memory Service build artifacts
+lib/gpu_memory_service/build/
+lib/gpu_memory_service/*.egg-info/
+lib/gpu_memory_service/**/*.so
 # Python
 __pycache__/

--- a/.github/filters.yaml
+++ b/.github/filters.yaml
@@ -78,6 +78,7 @@ core:
  - 'components/src/dynamo/mocker/**'
  - 'components/src/dynamo/frontend/**'
  - 'components/src/dynamo/common/**'
+  - 'components/src/dynamo/gpu_memory_service/**'
  - '*.toml'
  - '*.lock'
  - '*.py'

--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
 tensorrtllm_engines/
 api_server_models/
 server/
+!lib/gpu_memory_service/server/
 # Replay/Snapshot test artifacts
 *.new
 lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/

--- a/README.md
+++ b/README.md
@@ -331,7 +331,16 @@ cd lib/bindings/python
 maturin develop --uv
 ```
-## 6. Install the Wheel
+## 6. Install GPU Memory Service
+The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).
+```bash
+cd $PROJECT_ROOT
+uv pip install -e lib/gpu_memory_service
+```
+## 7. Install the Wheel
 ```
 cd $PROJECT_ROOT

--- a/components/src/dynamo/gpu_memory_service/__init__.py
+++ b/components/src/dynamo/gpu_memory_service/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service component for Dynamo.
+This module provides the Dynamo component wrapper around the gpu_memory_service package.
+The core functionality is in the gpu_memory_service package; this module provides:
+- CLI entry point (python -m dynamo.gpu_memory_service)
+- Re-exports for backwards compatibility
+"""
+# Re-export core functionality from gpu_memory_service package
+from gpu_memory_service import (
+    GMSClientMemoryManager,
+    StaleMemoryLayoutError,
+    get_gms_client_memory_manager,
+    get_or_create_gms_client_memory_manager,
+)
+# Re-export extensions (built separately)
+try:
+    from gpu_memory_service.client.torch.extensions import _allocator_ext
+except (ImportError, OSError):
+    _allocator_ext = None
+# Re-export module utilities
+from gpu_memory_service.client.torch.module import (
+    materialize_module_from_gms,
+    register_module_tensors,
+)
+__all__ = [
+    # Core
+    "GMSClientMemoryManager",
+    "StaleMemoryLayoutError",
+    # GMS client memory manager
+    "get_or_create_gms_client_memory_manager",
+    "get_gms_client_memory_manager",
+    # Tensor utilities
+    "register_module_tensors",
+    "materialize_module_from_gms",
+    # Extensions
+    "_allocator_ext",
+]
--- a/components/src/dynamo/gpu_memory_service/__main__.py
+++ b/components/src/dynamo/gpu_memory_service/__main__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from dynamo.gpu_memory_service.server import main
+if __name__ == "__main__":
+    main()
--- a/components/src/dynamo/gpu_memory_service/args.py
+++ b/components/src/dynamo/gpu_memory_service/args.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Argument parsing for GPU Memory Service server component."""
+import argparse
+import logging
+from dataclasses import dataclass
+logger = logging.getLogger(__name__)
+@dataclass
+class Config:
+    """Configuration for GPU Memory Service server.
+    Holds the values that parse_args() derives from the command line.
+    """
+    # GPU Memory Service specific
+    # CUDA device ID to manage memory for (--device).
+    device: int
+    # Unix domain socket path; parse_args() fills in the default or expands
+    # the {device} placeholder before storing it here.
+    socket_path: str
+    # True when -v/--verbose was given.
+    verbose: bool
+def parse_args() -> Config:
+    """Parse command line arguments for GPU Memory Service server.
+    Returns:
+        Config holding the device ID, the resolved socket path, and the
+        verbose flag.
+    Raises:
+        SystemExit: via argparse, on invalid arguments or when --socket-path
+            is not a valid template (unknown placeholder, stray brace, ...).
+    """
+    parser = argparse.ArgumentParser(
+        description="GPU Memory Service allocation server for Dynamo."
+    )
+    # GPU Memory Service specific arguments
+    parser.add_argument(
+        "--device",
+        type=int,
+        required=True,
+        help="CUDA device ID to manage memory for.",
+    )
+    parser.add_argument(
+        "--socket-path",
+        type=str,
+        default=None,
+        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
+        "Supports {device} placeholder for multi-GPU setups.",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Enable verbose logging.",
+    )
+    args = parser.parse_args()
+    # Generate default socket path if not provided
+    socket_path = args.socket_path
+    if socket_path is None:
+        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
+    else:
+        # Expand {device} placeholder. str.format() raises on any other
+        # placeholder or on an unbalanced brace; report that as a normal
+        # usage error instead of letting a traceback escape.
+        try:
+            socket_path = socket_path.format(device=args.device)
+        except (KeyError, IndexError, ValueError, AttributeError) as exc:
+            parser.error(
+                f"invalid --socket-path {socket_path!r}: only the {{device}} "
+                f"placeholder is supported ({exc!r})"
+            )
+    return Config(
+        device=args.device,
+        socket_path=socket_path,
+        verbose=args.verbose,
+    )
--- a/components/src/dynamo/gpu_memory_service/server.py
+++ b/components/src/dynamo/gpu_memory_service/server.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service allocation server component for Dynamo.
+This component wraps the GMSRPCServer from gpu_memory_service to manage
+GPU memory allocations with connection-based RW/RO locking.
+Workers connect via the socket path, which should be passed to vLLM/SGLang via:
+    --load-format gpu_memory_service
+    --model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
+Usage:
+    python -m dynamo.gpu_memory_service --device 0
+    python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
+"""
+import asyncio
+import logging
+import signal
+import uvloop
+from gpu_memory_service.server import GMSRPCServer
+from .args import parse_args
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+async def worker() -> None:
+    """Run the GPU Memory Service server until SIGTERM or SIGINT arrives."""
+    cfg = parse_args()
+    # With --verbose, raise both the root logger and this package's logger to DEBUG.
+    if cfg.verbose:
+        for logger_name in (None, "dynamo.gpu_memory_service"):
+            logging.getLogger(logger_name).setLevel(logging.DEBUG)
+    logger.info(f"Starting GPU Memory Service Server for device {cfg.device}")
+    logger.info(f"Socket path: {cfg.socket_path}")
+    rpc_server = GMSRPCServer(cfg.socket_path, device=cfg.device)
+    # The signal callback only sets this event; teardown runs in the finally below.
+    stop_requested = asyncio.Event()
+    def _request_stop():
+        logger.info("Received shutdown signal")
+        stop_requested.set()
+    running_loop = asyncio.get_running_loop()
+    running_loop.add_signal_handler(signal.SIGTERM, _request_stop)
+    running_loop.add_signal_handler(signal.SIGINT, _request_stop)
+    await rpc_server.start()
+    logger.info("GPU Memory Service Server ready, waiting for connections...")
+    logger.info(
+        f"To connect vLLM workers, use: --load-format gpu_memory_service "
+        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{cfg.socket_path}"}}\''
+    )
+    # Block here until one of the registered signals fires.
+    try:
+        await stop_requested.wait()
+    finally:
+        logger.info("Shutting down GPU Memory Service Server...")
+        await rpc_server.stop()
+        logger.info("GPU Memory Service Server shutdown complete")
+def main() -> None:
+    """Entry point for GPU Memory Service server.
+    Calls uvloop.install() before the event loop is created, then runs
+    worker() to completion with asyncio.run().
+    """
+    uvloop.install()
+    asyncio.run(worker())
+if __name__ == "__main__":
+    main()
--- a/container/Dockerfile
+++ b/container/Dockerfile
@@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ##############################################
 ########## Runtime image ##############
 ##############################################
@@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \
 # Install dynamo wheels (runtime packages only, no test dependencies)
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 RUN uv pip install \
    /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
    /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
    /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$GMS_WHEEL"; \
+    fi && \
    if [ "$ENABLE_KVBM" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \
@@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
        --requirement /tmp/requirements.test.txt
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 RUN uv pip install \
    /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
    /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
    /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$GMS_WHEEL"; \
+    fi && \
    if [ "$ENABLE_KVBM" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \

--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ##################################
 ########## Runtime Image #########
 ##################################
@@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src
 ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
 # Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
+ARG ENABLE_GPU_MEMORY_SERVICE
 RUN --mount=type=bind,source=.,target=/mnt/local_src \
    pip install --no-cache-dir --break-system-packages \
        /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
        /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
        /opt/dynamo/wheelhouse/nixl/nixl*.whl \
-        sglang==${SGLANG_VERSION}
+        sglang==${SGLANG_VERSION} && \
+    if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; \
+    fi
 # Install common and test dependencies as root
 RUN --mount=type=bind,source=.,target=/mnt/local_src \

--- a/container/Dockerfile.trtllm
+++ b/container/Dockerfile.trtllm
@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -454,6 +455,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ##################################################
 ########## Framework Builder Stage ##############
 ##################################################
@@ -770,12 +778,21 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
 # Install dynamo, NIXL, and dynamo-specific dependencies
 # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
 RUN uv pip install \
      --no-cache \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install --no-cache "$GMS_WHEEL"; \
+    fi && \
    if [ "${ENABLE_KVBM}" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -41,6 +41,7 @@ ARG BASE_IMAGE_TAG
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -481,6 +482,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ########################################################
 ########## Framework Development Image ################
 ########################################################
@@ -605,6 +613,7 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin
 COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/
 COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
 COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
+COPY --from=dynamo_base /usr/local/cuda/lib64/stubs/ /usr/local/cuda/lib64/stubs/
 RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\
    ln -s /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\
    ln -s /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so
@@ -744,11 +753,20 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
 # Install dynamo, NIXL, and dynamo-specific dependencies
 # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
 RUN uv pip install \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$GMS_WHEEL"; \
+    fi && \
    if [ "${ENABLE_KVBM}" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \
@@ -823,6 +841,7 @@ RUN cd /usr/local/lib && \
    ldconfig
 USER dynamo
 ARG DYNAMO_COMMIT_SHA
 ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA

--- a/container/build.sh
+++ b/container/build.sh
@@ -156,6 +156,10 @@ PUSH=""
 # or can be explicitly enabled via --enable-kvbm flag
 ENABLE_KVBM=false
+# GPU Memory Service - default disabled, enabled automatically for VLLM/SGLANG
+# or can be explicitly enabled via --enable-gpu-memory-service flag
+ENABLE_GPU_MEMORY_SERVICE=false
 # sccache configuration for S3
 USE_SCCACHE=""
 SCCACHE_BUCKET=""
@@ -343,6 +347,9 @@ get_options() {
        --enable-kvbm)
            ENABLE_KVBM=true
            ;;
+        --enable-gpu-memory-service)
+            ENABLE_GPU_MEMORY_SERVICE=true
+            ;;
        --enable-media-nixl)
            ENABLE_MEDIA_NIXL=true
            ;;
@@ -539,6 +546,7 @@ show_help() {
    echo "  [--release-build perform a release build]"
    echo "  [--make-efa Adds AWS EFA layer on top of the built image (works with any target)]"
    echo "  [--enable-kvbm Enables KVBM support in Python 3.12]"
+    echo "  [--enable-gpu-memory-service Enables GPU Memory Service support]"
    echo "  [--enable-media-nixl Enable media processing with NIXL support (default: true for frameworks, false for none)]"
    echo "  [--enable-media-ffmpeg Enable media processing with FFMPEG support (default: true for frameworks, false for none)]"
    echo "  [--use-sccache enable sccache for Rust/C/C++ compilation caching]"
@@ -831,6 +839,20 @@ if [[ ${ENABLE_KVBM} == "true" ]]; then
    BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} "
 fi
+# ENABLE_GPU_MEMORY_SERVICE: Used in Dockerfiles to build and install the gpu_memory_service wheel
+#                            (base, vLLM, SGLang, and TRT-LLM Dockerfiles all honor it).
+# Force GPU Memory Service to be enabled for VLLM and SGLANG frameworks
+if [[ $FRAMEWORK == "VLLM" ]] || [[ $FRAMEWORK == "SGLANG" ]]; then
+    echo "Forcing enable_gpu_memory_service to true in ${FRAMEWORK} image build"
+    ENABLE_GPU_MEMORY_SERVICE=true
+fi
+# For other frameworks, ENABLE_GPU_MEMORY_SERVICE defaults to false unless --enable-gpu-memory-service flag was provided
+if [[ ${ENABLE_GPU_MEMORY_SERVICE} == "true" ]]; then
+    echo "Enabling GPU Memory Service in the dynamo image"
+    BUILD_ARGS+=" --build-arg ENABLE_GPU_MEMORY_SERVICE=${ENABLE_GPU_MEMORY_SERVICE} "
+fi
 # ENABLE_MEDIA_NIXL: Enable media processing with NIXL support
 # Used in base Dockerfile for maturin build feature flag.
 # Can be explicitly overridden with --enable-media-nixl flag

--- a/lib/gpu_memory_service/__init__.py
+++ b/lib/gpu_memory_service/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service - out-of-process GPU memory manager.
+The GPU Memory Service decouples ownership of GPU memory from the processes
+that use it, enabling zero-copy sharing and data survival across process crashes.
+Package structure:
+- common/: Shared types and protocol (used by both server and client)
+- server/: Allocation server daemon (no CUDA context required)
+- client/: Client library for memory management
+  - client/torch/: PyTorch integration (allocator, tensor, module, extensions)
+Primary client API:
+    from gpu_memory_service import (
+        GMSClientMemoryManager,
+        get_or_create_gms_client_memory_manager,
+        get_gms_client_memory_manager,
+    )
+Server API:
+    from gpu_memory_service.server import GMSRPCServer
+"""
+# Primary client exports
+from gpu_memory_service.client.memory_manager import (
+    GMSClientMemoryManager,
+    StaleMemoryLayoutError,
+)
+# PyTorch integration (GMS client memory manager)
+from gpu_memory_service.client.torch.allocator import (
+    get_gms_client_memory_manager,
+    get_or_create_gms_client_memory_manager,
+)
+__all__ = [
+    # Client
+    "GMSClientMemoryManager",
+    "StaleMemoryLayoutError",
+    # GMS client memory manager
+    "get_or_create_gms_client_memory_manager",
+    "get_gms_client_memory_manager",
+]
--- a/lib/gpu_memory_service/client/__init__.py
+++ b/lib/gpu_memory_service/client/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service client library.
+This module provides the client-side components for interacting with the
+GPU Memory Service:
+- GMSClientMemoryManager: Manages local VA mappings of remote GPU memory
+- GMSRPCClient: Low-level RPC client (pure Python, no PyTorch dependency)
+For PyTorch integration (MemPool, tensor utilities), see gpu_memory_service.client.torch.
+"""
+from gpu_memory_service.client.memory_manager import (
+    GMSClientMemoryManager,
+    StaleMemoryLayoutError,
+)
+from gpu_memory_service.client.rpc import GMSRPCClient
+__all__ = [
+    "GMSClientMemoryManager",
+    "StaleMemoryLayoutError",
+    "GMSRPCClient",
+]
--- a/lib/gpu_memory_service/client/cuda_vmm_utils.py
+++ b/lib/gpu_memory_service/client/cuda_vmm_utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Client-side CUDA VMM utilities.
+These functions wrap CUDA driver API calls used by the client memory manager
+for importing, mapping, and unmapping GPU memory.
+"""
+from __future__ import annotations
+from cuda.bindings import driver as cuda
+from gpu_memory_service.common.cuda_vmm_utils import check_cuda_result
+from gpu_memory_service.common.types import GrantedLockType
+def import_handle_from_fd(fd: int) -> int:
+    """Convert a shared POSIX file descriptor into a CUDA memory handle.
+    Args:
+        fd: POSIX file descriptor received via SCM_RIGHTS.
+    Returns:
+        The imported CUDA memory handle, as a plain int.
+    """
+    fd_handle_type = (
+        cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+    )
+    status, imported = cuda.cuMemImportFromShareableHandle(fd, fd_handle_type)
+    check_cuda_result(status, "cuMemImportFromShareableHandle")
+    return int(imported)
+def reserve_va(size: int, granularity: int) -> int:
+    """Reserve a range of virtual address space via cuMemAddressReserve.
+    Args:
+        size: Number of bytes to reserve (should be aligned to granularity).
+        granularity: VMM allocation granularity, passed as the second argument.
+    Returns:
+        The reserved virtual address, as a plain int.
+    """
+    # The two trailing zeros are forwarded unchanged as the third and fourth arguments.
+    status, base_address = cuda.cuMemAddressReserve(size, granularity, 0, 0)
+    check_cuda_result(status, "cuMemAddressReserve")
+    return int(base_address)
+def free_va(va: int, size: int) -> None:
+    """Give back a virtual address reservation via cuMemAddressFree.
+    Args:
+        va: Start of the reservation to free.
+        size: Size of the reservation in bytes.
+    """
+    (status,) = cuda.cuMemAddressFree(va, size)
+    check_cuda_result(status, "cuMemAddressFree")
+def map_to_va(va: int, size: int, handle: int) -> None:
+    """Back a virtual address range with a CUDA handle via cuMemMap.
+    Args:
+        va: Target virtual address (must be reserved).
+        size: Size of the mapping in bytes.
+        handle: CUDA memory handle to map.
+    """
+    # The zeros are passed through as the third and fifth cuMemMap arguments.
+    (status,) = cuda.cuMemMap(va, size, 0, handle, 0)
+    check_cuda_result(status, "cuMemMap")
+def set_access(va: int, size: int, device: int, access: GrantedLockType) -> None:
+    """Apply access permissions to a mapped region via cuMemSetAccess.
+    Args:
+        va: Virtual address of the region.
+        size: Size of the region in bytes.
+        device: CUDA device index the permission applies to.
+        access: GrantedLockType.RO selects read-only; any other value
+            selects read-write.
+    """
+    if access == GrantedLockType.RO:
+        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READ
+    else:
+        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+    descriptor = cuda.CUmemAccessDesc()
+    descriptor.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+    descriptor.location.id = device
+    descriptor.flags = protection
+    # One descriptor is passed, so the count argument is 1.
+    (status,) = cuda.cuMemSetAccess(va, size, [descriptor], 1)
+    check_cuda_result(status, "cuMemSetAccess")
+def unmap(va: int, size: int) -> None:
+    """Remove the mapping from a virtual address region via cuMemUnmap.
+    Args:
+        va: Start of the region to unmap.
+        size: Size of the mapping in bytes.
+    """
+    (status,) = cuda.cuMemUnmap(va, size)
+    check_cuda_result(status, "cuMemUnmap")
+def release_handle(handle: int) -> None:
+    """Release a CUDA memory handle via cuMemRelease.
+    Args:
+        handle: The CUDA memory handle to release.
+    """
+    (status,) = cuda.cuMemRelease(handle)
+    check_cuda_result(status, "cuMemRelease")
--- a/lib/gpu_memory_service/client/memory_manager.py
+++ b/lib/gpu_memory_service/client/memory_manager.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service client-side memory manager.
+This is the unified memory manager for the GPU Memory Service architecture.
+Key properties:
+- Uses GMSRPCClient over a Unix-domain socket.
+- The socket connection itself is the RW/RO lock.
+- In write mode, the manager can allocate + map RW and then publish via commit().
+- In read mode, the manager can import + map RO and hold the RO lock during inference.
+- sleep()/wake() releases and reacquires the RO lock (and remaps allocations).
+This module uses cuda-python bindings for CUDA driver API calls:
+- import FDs (cuMemImportFromShareableHandle)
+- reserve VA (cuMemAddressReserve)
+- map/unmap (cuMemMap/cuMemUnmap)
+- enforce access (cuMemSetAccess)
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+import torch
+from cuda.bindings import driver as cuda
+from gpu_memory_service.client.cuda_vmm_utils import (
+    free_va,
+    import_handle_from_fd,
+    map_to_va,
+    release_handle,
+    reserve_va,
+    set_access,
+    unmap,
+)
+from gpu_memory_service.client.rpc import GMSRPCClient
+from gpu_memory_service.common.cuda_vmm_utils import (
+    align_to_granularity,
+    get_allocation_granularity,
+)
+from gpu_memory_service.common.types import GrantedLockType, RequestedLockType
+logger = logging.getLogger(__name__)
+class StaleMemoryLayoutError(Exception):
+    """Signals that the memory layout changed while this reader was sleeping.
+    A writer took the RW lock and altered the allocation structure (different
+    sizes, different tensor layouts) during the sleep. The caller should
+    re-import the model from scratch.
+    IMPORTANT: this is a LAYOUT check, NOT a CONTENT check.
+    - Detected: allocation sizes changed, tensors added/removed, metadata
+      structure changed
+    - NOT detected: weight values modified in-place
+    That is deliberate: sleep/wake supports use cases such as RL training,
+    where another process writes to the same memory locations (e.g. updating
+    weights) without changing the structure. wake() succeeds as long as the
+    layout (allocation and metadata table hashes) is identical.
+    """
+@dataclass(frozen=True)
+class LocalMapping:
+    """Frozen record describing one local VA mapping.
+    Instances cannot be mutated; with_handle() and with_access() return
+    modified copies instead.
+    """
+    allocation_id: str
+    va: int
+    size: int
+    aligned_size: int
+    handle: int  # 0 if unmapped but VA reserved
+    tag: str
+    access: GrantedLockType
+    def with_handle(self, handle: int) -> "LocalMapping":
+        """Return a copy that differs only in ``handle``."""
+        return LocalMapping(
+            allocation_id=self.allocation_id,
+            va=self.va,
+            size=self.size,
+            aligned_size=self.aligned_size,
+            handle=handle,
+            tag=self.tag,
+            access=self.access,
+        )
+    def with_access(self, access: GrantedLockType) -> "LocalMapping":
+        """Return a copy that differs only in ``access``."""
+        return LocalMapping(
+            allocation_id=self.allocation_id,
+            va=self.va,
+            size=self.size,
+            aligned_size=self.aligned_size,
+            handle=self.handle,
+            tag=self.tag,
+            access=access,
+        )
+class GMSClientMemoryManager:
+    """Unified memory manager that can act as writer or reader.
+    Modes:
+    - mode=RequestedLockType.RW: acquire RW lock, allocate/map RW, mutate metadata, commit/publish.
+    - mode=RequestedLockType.RO: acquire RO lock (READY only), import/map RO, sleep/wake.
+    - mode=RequestedLockType.RW_OR_RO: try RW if available, else wait for RO.
+    """
+    def __init__(
+        self,
+        socket_path: str,
+        *,
+        mode: RequestedLockType,
+        device: int = 0,
+        timeout_ms: Optional[int] = None,
+    ) -> None:
+        """Initialize local state and connect to the server immediately.
+        Args:
+            socket_path: Socket path handed to GMSRPCClient.
+            mode: Lock type to request; the granted type is stored in
+                ``_mode`` by _connect() and may differ from the request.
+            device: CUDA device index used for torch.cuda.set_device() and
+                for the allocation-granularity lookup.
+            timeout_ms: Timeout forwarded to GMSRPCClient; None is passed
+                through unchanged.
+        """
+        self.socket_path = socket_path
+        self.device = device
+        self._timeout_ms = timeout_ms
+        self._client: Optional[GMSRPCClient] = None
+        self._mappings: Dict[int, LocalMapping] = {}  # va -> mapping
+        self._allocation_id_to_va: Dict[str, int] = {}
+        self._sleeping = False
+        self._closed = False
+        self._preserved_allocation_ids: List[str] = []
+        self._published = False
+        self._mode: Optional[GrantedLockType] = None  # Updated by _connect
+        # VA-stable sleep/wake state
+        self._va_preserved = False
+        self._last_memory_layout_hash: str = (
+            ""  # Hash from server, saved on connect/commit
+        )
+        # Ensure torch is on the right device for subsequent CUDA operations.
+        if torch.cuda.is_available():
+            torch.cuda.set_device(self.device)
+        # Cache granularity for VA alignment
+        self.granularity = get_allocation_granularity(device)
+        # Connect last, once every attribute above has been initialized.
+        self._connect(lock_type=mode, timeout_ms=timeout_ms)
+    def _connect(
+        self,
+        *,
+        lock_type: RequestedLockType,
+        timeout_ms: Optional[int],
+        update_memory_layout_hash: bool = True,
+    ) -> None:
+        """Create a new GMSRPCClient and record the resulting connection state.
+        Args:
+            lock_type: Lock type to request from the server.
+            timeout_ms: Timeout forwarded to GMSRPCClient.
+            update_memory_layout_hash: When True and the client reports
+                ``committed``, store the server's memory layout hash in
+                ``_last_memory_layout_hash``.
+        """
+        self._client = GMSRPCClient(
+            self.socket_path, lock_type=lock_type, timeout_ms=timeout_ms
+        )
+        # A fresh connection always clears the sleeping flag.
+        self._sleeping = False
+        # Update mode based on granted lock type (may differ from requested for rw_or_ro)
+        self._mode = self._client.lock_type
+        # Save state hash for stale detection on wake (skip during wake itself)
+        if update_memory_layout_hash and self._client.committed:
+            self._last_memory_layout_hash = self._client.get_memory_layout_hash()
+    @property
+    def mode(self) -> Optional[GrantedLockType]:
+        """Lock type granted on the most recent connect, cached on the manager.
+        Unlike ``lock_type``, this value is not derived from the live client:
+        it is only written in ``__init__`` (None) and in ``_connect``, so it
+        keeps its last value after the client is dropped by commit() or sleep().
+        """
+        return self._mode
+    @property
+    def lock_type(self) -> Optional[GrantedLockType]:
+        """Get the lock type actually granted by the server."""
+        if self._client is None:
+            return None
+        return self._client.lock_type
+    @property
+    def is_connected(self) -> bool:
+        return self._client is not None and self._client.is_connected
+    @property
+    def is_sleeping(self) -> bool:
+        """Value of the internal sleeping flag.
+        The flag is set at the end of sleep() and cleared by _connect() and close().
+        """
+        return self._sleeping
+    @property
+    def mappings(self) -> Dict[int, LocalMapping]:
+        """VA -> LocalMapping dictionary.
+        NOTE: this returns the manager's live internal dict, not a copy or a
+        read-only proxy. Callers should treat it as read-only; mutating it
+        bypasses the allocation_id -> VA index kept alongside it.
+        """
+        return self._mappings
+    @property
+    def total_bytes(self) -> int:
+        """Total bytes allocated across all mappings."""
+        return sum(m.aligned_size for m in self._mappings.values())
+    # ==================== Metadata convenience ====================
+    def metadata_put(
+        self, key: str, allocation_id: str, offset_bytes: int, value: bytes
+    ) -> bool:
+        """Forward a metadata put to the connected RPC client and return its result.
+        Raises:
+            RuntimeError: If the manager is sleeping or not connected (raised
+                by the ``_client_rpc`` accessor).
+        """
+        return self._client_rpc.metadata_put(key, allocation_id, offset_bytes, value)
+    def metadata_get(self, key: str) -> Optional[tuple[str, int, bytes]]:
+        """Forward a metadata lookup to the connected RPC client.
+        Returns whatever the client returns for ``key``; raises RuntimeError
+        when the manager is sleeping or not connected.
+        """
+        return self._client_rpc.metadata_get(key)
+    def metadata_list(self, prefix: str = "") -> List[str]:
+        """Forward a metadata key listing (filtered by ``prefix``) to the RPC client.
+        Raises RuntimeError when the manager is sleeping or not connected.
+        """
+        return self._client_rpc.metadata_list(prefix)
+    def metadata_delete(self, key: str) -> bool:
+        """Forward a metadata delete for ``key`` to the RPC client and return its result.
+        Raises RuntimeError when the manager is sleeping or not connected.
+        """
+        return self._client_rpc.metadata_delete(key)
+    # ==================== Allocation operations ====================
+    def list_allocations(self, tag: Optional[str] = None) -> List[Dict]:
+        """List allocations known to the server.
+        Args:
+            tag: Forwarded unchanged to the RPC client (presumably a tag
+                filter; confirm against the server implementation).
+        Raises:
+            RuntimeError: If the manager is sleeping or not connected.
+        """
+        return self._client_rpc.list_allocations(tag)
+    def allocate_and_map(self, size: int, tag: str = "default") -> int:
+        """Allocate on server, reserve VA, and map locally.
+        Args:
+            size: Requested allocation size in bytes.
+            tag: Allocation tag for server tracking.
+        Returns:
+            Virtual address of the mapped allocation.
+        """
+        self._require_rw()
+        client = self._client_rpc
+        aligned_size = align_to_granularity(size, self.granularity)
+        va = reserve_va(aligned_size, self.granularity)
+        try:
+            allocation_id, server_aligned = client.allocate(aligned_size, tag)
+            if int(server_aligned) != aligned_size:
+                raise RuntimeError(
+                    f"Alignment mismatch: {aligned_size} vs {server_aligned}"
+                )
+            fd = client.export(allocation_id)
+            handle = import_handle_from_fd(fd)
+            map_to_va(va, aligned_size, handle)
+            set_access(va, aligned_size, self.device, GrantedLockType.RW)
+            self._track_mapping(
+                LocalMapping(
+                    allocation_id=allocation_id,
+                    va=va,
+                    size=size,
+                    aligned_size=aligned_size,
+                    handle=handle,
+                    tag=tag,
+                    access=GrantedLockType.RW,
+                )
+            )
+            return va
+        except Exception:
+            free_va(va, aligned_size)
+            raise
+    def free_mapping(self, va: int) -> None:
+        """Unmap and free a local mapping."""
+        mapping = self._mappings.pop(va, None)
+        if mapping is None:
+            return
+        self._allocation_id_to_va.pop(mapping.allocation_id, None)
+        try:
+            if mapping.handle != 0:
+                unmap(va, mapping.aligned_size)
+                release_handle(mapping.handle)
+            free_va(va, mapping.aligned_size)
+        except Exception as e:
+            logger.warning(f"Error freeing VA 0x{va:x}: {e}")
+        if self.lock_type == GrantedLockType.RW and not self._published:
+            try:
+                self._client_rpc.free(mapping.allocation_id)
+            except Exception:
+                pass
+    def import_allocation(self, allocation_id: str) -> int:
+        """Import an existing allocation and map locally.
+        In RO mode, maps read-only. In RW mode, maps read-write.
+        """
+        if allocation_id in self._allocation_id_to_va:
+            return self._allocation_id_to_va[allocation_id]
+        client = self._client_rpc
+        # lock_type is guaranteed non-None when connected (after _client_rpc succeeds)
+        assert self.lock_type is not None
+        current_access = self.lock_type
+        alloc_info = client.get_allocation(allocation_id)
+        aligned_size = int(alloc_info.aligned_size)
+        size = int(alloc_info.size)
+        tag = str(getattr(alloc_info, "tag", "default"))
+        va = reserve_va(aligned_size, self.granularity)
+        try:
+            fd = client.export(allocation_id)
+            handle = import_handle_from_fd(fd)
+            map_to_va(va, aligned_size, handle)
+            set_access(va, aligned_size, self.device, current_access)
+            self._track_mapping(
+                LocalMapping(
+                    allocation_id=allocation_id,
+                    va=va,
+                    size=size,
+                    aligned_size=aligned_size,
+                    handle=handle,
+                    tag=tag,
+                    access=current_access,
+                )
+            )
+            return va
+        except Exception:
+            free_va(va, aligned_size)
+            raise
+    def clear_all(self) -> int:
+        """Clear all allocations on the server (RW only). Local mappings are unmapped first."""
+        self._require_rw()
+        self._unmap_all()
+        return self._client_rpc.clear_all()
+    # ==================== Publish / mode switching ====================
+    def commit(self) -> bool:
+        """Publish weights (RW only).
+        Client responsibilities:
+        - cudaDeviceSynchronize() before publishing
+        - flip local mappings to RO before publishing
+        Server responsibilities:
+        - transition to COMMITTED
+        - close the RW socket (publish + release)
+        """
+        self._require_rw()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize(self.device)
+        # After publishing, prevent further writes locally.
+        for va, m in list(self._mappings.items()):
+            if m.access != GrantedLockType.RO:
+                set_access(m.va, m.aligned_size, self.device, GrantedLockType.RO)
+                self._mappings[va] = m.with_access(GrantedLockType.RO)
+        ok = self._client_rpc.commit()
+        self._published = bool(ok)
+        # _client.commit() closes the socket on success; reflect that here.
+        if ok:
+            self._client = None
+        return bool(ok)
+    def switch_to_read(self, timeout_ms: Optional[int] = None) -> None:
+        """Acquire an RO lock after publishing.
+        This is intended for the common flow where a writer loads weights and then
+        becomes a reader for inference.
+        """
+        if self._closed:
+            raise RuntimeError("Memory manager is closed")
+        if self._sleeping:
+            raise RuntimeError(
+                "Cannot switch_to_read() while sleeping; call wake() first"
+            )
+        if self._client is not None:
+            if self.lock_type == GrantedLockType.RO:
+                return
+            raise RuntimeError(
+                "switch_to_read() requires the RW connection to be released (call commit() first)"
+            )
+        eff_timeout = timeout_ms if timeout_ms is not None else self._timeout_ms
+        self._connect(lock_type=RequestedLockType.RO, timeout_ms=eff_timeout)
+    # ==================== Sleep / wake (read mode) ====================
+    def sleep(self) -> None:
+        """Release RO lock and unmap local allocations (VA-stable).
+        VAs are preserved during sleep so tensor pointers remain stable.
+        On wake, allocations are remapped to the same VAs.
+        """
+        if self._closed:
+            raise RuntimeError("Memory manager is closed")
+        if self._sleeping:
+            return
+        if self.lock_type != GrantedLockType.RO:
+            raise RuntimeError("sleep() requires RO mode")
+        if torch.cuda.is_available():
+            torch.cuda.synchronize(self.device)
+        # Preserve allocation IDs for remapping on wake
+        self._preserved_allocation_ids = list(self._allocation_id_to_va.keys())
+        # Unmap physical memory but keep VA reservations
+        self._unmap_preserving_va()
+        self._va_preserved = True
+        self._client_rpc.close()
+        self._client = None
+        self._sleeping = True
+    def wake(self, timeout_ms: Optional[int] = None) -> bool:
+        """Reacquire RO lock and remap preserved allocations (VA-stable).
+        Allocations are remapped to the same VAs they had before sleep,
+        ensuring tensor pointers remain valid.
+        Args:
+            timeout_ms: Timeout for RO lock acquisition.
+        Returns:
+            True on success.
+        Raises:
+            TimeoutError: If timeout_ms expires waiting for RO lock.
+            StaleMemoryLayoutError: If weights were structurally changed while sleeping.
+        """
+        if self._closed:
+            raise RuntimeError("Memory manager is closed")
+        if not self._sleeping:
+            return True
+        if torch.cuda.is_available():
+            torch.cuda.set_device(self.device)
+        eff_timeout = timeout_ms if timeout_ms is not None else self._timeout_ms
+        self._connect(
+            lock_type=RequestedLockType.RO,
+            timeout_ms=eff_timeout,
+            update_memory_layout_hash=False,
+        )
+        # Check if memory layout changed while sleeping
+        current_hash = self._client_rpc.get_memory_layout_hash()
+        if (
+            self._last_memory_layout_hash
+            and current_hash != self._last_memory_layout_hash
+        ):
+            raise StaleMemoryLayoutError(
+                f"State changed while sleeping: hash {self._last_memory_layout_hash[:16]}... -> {current_hash[:16]}..."
+            )
+        # Remap to preserved VAs
+        remapped_count = 0
+        failed_count = 0
+        total_bytes = 0
+        for alloc_id in self._preserved_allocation_ids:
+            try:
+                va = self._remap_preserved_va(alloc_id)
+                mapping = self._mappings.get(va)
+                if mapping:
+                    total_bytes += mapping.aligned_size
+                remapped_count += 1
+            except StaleMemoryLayoutError:
+                raise  # Let StaleMemoryLayoutError propagate
+            except Exception as e:
+                logger.warning(f"Failed to remap {alloc_id}: {e}")
+                failed_count += 1
+        if failed_count > 0:
+            raise RuntimeError(
+                f"Wake failed: {failed_count} of {len(self._preserved_allocation_ids)} "
+                f"allocations could not be remapped"
+            )
+        logger.info(
+            f"[GPU Memory Service] Wake complete on device {self.device}: "
+            f"remapped {remapped_count} allocations ({total_bytes / (1 << 30):.2f} GiB)"
+        )
+        self._sleeping = False
+        self._va_preserved = False
+        return True
+    # ==================== Cleanup ====================
+    def close(self) -> None:
+        if self._closed:
+            return
+        # Ensure kernels are done before tearing down mappings.
+        if torch.cuda.is_available():
+            torch.cuda.synchronize(self.device)
+        # Release all mappings including preserved VA reservations
+        self._unmap_all()
+        if self._client is not None:
+            self._client.close()
+            self._client = None
+        self._closed = True
+        self._sleeping = False
+        self._va_preserved = False
+        self._preserved_allocation_ids.clear()
+    def __enter__(self) -> "GMSClientMemoryManager":
+        """Context-manager entry; returns the manager itself (the connection is made in __init__)."""
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Context-manager exit; always calls close() and does not suppress exceptions."""
+        self.close()
+    # ==================== Internals ====================
+    @property
+    def _client_rpc(self) -> GMSRPCClient:
+        """Get connected client or raise. Use instead of _require_connected() + assert."""
+        if self._client is None:
+            if self._sleeping:
+                raise RuntimeError("Memory manager is sleeping")
+            raise RuntimeError("Memory manager is not connected")
+        return self._client
+    def _require_rw(self) -> None:
+        """Raise if not in RW mode."""
+        if self.lock_type != GrantedLockType.RW:
+            raise RuntimeError("Operation requires RW mode")
+    def _track_mapping(self, m: LocalMapping) -> None:
+        self._mappings[m.va] = m
+        self._allocation_id_to_va[m.allocation_id] = m.va
+    def _unmap_preserving_va(self) -> None:
+        """Unmap physical memory but PRESERVE VA reservations for sleep/wake.
+        This keeps the VA reservation intact so tensors maintain stable pointers.
+        On wake, we can remap to the same VAs.
+        """
+        unmapped_count = 0
+        total_bytes = 0
+        for va, mapping in list(self._mappings.items()):
+            if mapping.handle == 0:
+                continue  # Already unmapped
+            try:
+                unmap(va, mapping.aligned_size)
+                release_handle(mapping.handle)
+                self._mappings[va] = mapping.with_handle(
+                    0
+                )  # Mark unmapped, VA reserved
+                unmapped_count += 1
+                total_bytes += mapping.aligned_size
+            except Exception as e:
+                logger.warning(
+                    f"Error unmapping VA 0x{va:x} (preserving reservation): {e}"
+                )
+        logger.info(
+            f"[GPU Memory Service] Unmapped {unmapped_count} allocations ({total_bytes / (1 << 30):.2f} GiB), "
+            f"preserving {len(self._mappings)} VA reservations"
+        )
+    def _remap_preserved_va(self, allocation_id: str) -> int:
+        """Remap an allocation to its preserved VA.
+        Requires the VA to already be reserved (from before sleep).
+        Validates allocation still exists and size matches.
+        Returns the VA.
+        Raises StaleMemoryLayoutError if allocation is missing or size changed.
+        """
+        if torch.cuda.is_available():
+            torch.cuda.set_device(self.device)
+        va = self._allocation_id_to_va.get(allocation_id)
+        if va is None:
+            raise RuntimeError(f"No preserved VA for allocation {allocation_id}")
+        mapping = self._mappings.get(va)
+        if mapping is None:
+            raise RuntimeError(f"No mapping info for VA 0x{va:x}")
+        if mapping.handle != 0:
+            return va  # Already mapped
+        client = self._client_rpc
+        # lock_type is guaranteed non-None when connected (after _client_rpc succeeds)
+        assert self.lock_type is not None
+        current_access = self.lock_type
+        # Validate allocation still exists and size matches
+        try:
+            alloc_info = client.get_allocation(allocation_id)
+        except Exception as e:
+            raise StaleMemoryLayoutError(
+                f"Allocation {allocation_id} no longer exists on server: {e}"
+            ) from e
+        server_aligned_size = int(alloc_info.aligned_size)
+        if server_aligned_size != mapping.aligned_size:
+            raise StaleMemoryLayoutError(
+                f"Allocation {allocation_id} size changed: expected {mapping.aligned_size}, got {server_aligned_size}"
+            )
+        # Re-import the handle and map to the SAME VA (which is still reserved)
+        fd = client.export(allocation_id)
+        handle = import_handle_from_fd(fd)
+        map_to_va(va, mapping.aligned_size, handle)
+        # Set access permissions based on current lock type
+        set_access(va, mapping.aligned_size, self.device, current_access)
+        # Synchronize to ensure mapping is complete before any access
+        cuda.cuCtxSynchronize()
+        # Validate the pointer is accessible (this is what Triton checks)
+        result, _dev_ptr = cuda.cuPointerGetAttribute(
+            cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER, va
+        )
+        if result != cuda.CUresult.CUDA_SUCCESS:
+            err_result, err_str = cuda.cuGetErrorString(result)
+            err_msg = ""
+            if err_result == cuda.CUresult.CUDA_SUCCESS and err_str:
+                err_msg = (
+                    err_str.decode() if isinstance(err_str, bytes) else str(err_str)
+                )
+            logger.warning(
+                f"[GPU Memory Service] cuPointerGetAttribute failed for VA 0x{va:x} after remap: "
+                f"error {result} ({err_msg})"
+            )
+        else:
+            logger.debug(
+                f"[GPU Memory Service] Remapped VA 0x{va:x} validated OK (device={self.device})"
+            )
+        # Update mapping with new handle and access
+        updated = mapping.with_handle(handle)
+        self._mappings[va] = updated.with_access(current_access)
+        return va
+    def _unmap_all(self) -> None:
+        """Unmap and release all local mappings including VA reservations."""
+        for va, mapping in list(self._mappings.items()):
+            try:
+                if mapping.handle != 0:
+                    unmap(va, mapping.aligned_size)
+                    release_handle(mapping.handle)
+                free_va(va, mapping.aligned_size)
+            except Exception as e:
+                logger.warning(f"Error unmapping VA 0x{va:x}: {e}")
+        self._mappings.clear()
+        self._allocation_id_to_va.clear()
--- a/lib/gpu_memory_service/client/rpc.py
+++ b/lib/gpu_memory_service/client/rpc.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service RPC Client.
+Low-level RPC client stub. The client provides a simple interface for acquiring
+locks and performing allocation operations. The socket connection IS the lock.
+This module has NO PyTorch dependency.
+Usage:
+    # Writer (acquires RW lock in constructor)
+    with GMSRPCClient(socket_path, lock_type=RequestedLockType.RW) as client:
+        alloc_id, aligned_size = client.allocate(size=1024*1024)
+        fd = client.export(alloc_id)
+        # ... write weights using fd ...
+        client.commit()
+    # Lock released on exit
+    # Reader (acquires RO lock in constructor)
+    client = GMSRPCClient(socket_path, lock_type=RequestedLockType.RO)
+    if client.committed:  # Check if weights are valid
+        allocations = client.list_allocations()
+        for alloc in allocations:
+            fd = client.export(alloc["allocation_id"])
+            # ... import and map fd ...
+    # Keep connection open during inference!
+    # client.close() only when done with inference
+"""
+import logging
+import socket
+from typing import Dict, List, Optional, Tuple, Type, TypeVar
+from gpu_memory_service.common.protocol.messages import (
+    AllocateRequest,
+    AllocateResponse,
+    ClearAllRequest,
+    ClearAllResponse,
+    CommitRequest,
+    CommitResponse,
+    ErrorResponse,
+    ExportRequest,
+    FreeRequest,
+    FreeResponse,
+    GetAllocationRequest,
+    GetAllocationResponse,
+    GetAllocationStateRequest,
+    GetAllocationStateResponse,
+    GetLockStateRequest,
+    GetLockStateResponse,
+    GetStateHashRequest,
+    GetStateHashResponse,
+    HandshakeRequest,
+    HandshakeResponse,
+    ListAllocationsRequest,
+    ListAllocationsResponse,
+    MetadataDeleteRequest,
+    MetadataDeleteResponse,
+    MetadataGetRequest,
+    MetadataGetResponse,
+    MetadataListRequest,
+    MetadataListResponse,
+    MetadataPutRequest,
+    MetadataPutResponse,
+)
+from gpu_memory_service.common.protocol.wire import recv_message_sync, send_message_sync
+from gpu_memory_service.common.types import (
+    RW_REQUIRED,
+    GrantedLockType,
+    RequestedLockType,
+)
+# Response type parameter: GMSRPCClient._call returns the same type it is asked to validate.
+T = TypeVar("T")
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+class GMSRPCClient:
+    """GPU Memory Service RPC Client.
+    CRITICAL: Socket connection IS the lock.
+    - Constructor blocks until lock is acquired
+    - close() releases the lock
+    - committed property tells readers if weights are valid
+    For writers (lock_type=RequestedLockType.RW):
+        - Use context manager (with statement) for automatic lock release
+        - Call commit() after weights are written
+        - Call clear_all() before loading new model
+    For readers (lock_type=RequestedLockType.RO):
+        - Check committed property after construction
+        - Keep connection open during inference lifetime
+        - Only call close() when shutting down or allowing weight updates
+    """
+    def __init__(
+        self,
+        socket_path: str,
+        lock_type: RequestedLockType = RequestedLockType.RO,
+        timeout_ms: Optional[int] = None,
+    ):
+        """Connect to Allocation Server and acquire lock.
+        Args:
+            socket_path: Path to server's Unix domain socket
+            lock_type: Requested lock type (RW, RO, or RW_OR_RO)
+            timeout_ms: Timeout in milliseconds for lock acquisition.
+                        None means wait indefinitely.
+        Raises:
+            ConnectionError: If connection fails
+            TimeoutError: If timeout_ms expires waiting for lock
+        """
+        self.socket_path = socket_path
+        self._requested_lock_type = lock_type
+        self._socket: Optional[socket.socket] = None
+        self._recv_buffer = bytearray()
+        self._committed = False
+        self._granted_lock_type: Optional[GrantedLockType] = None
+        # Connect and acquire lock
+        self._connect(timeout_ms=timeout_ms)
+    def _connect(self, timeout_ms: Optional[int]) -> None:
+        """Connect to server and perform handshake (lock acquisition)."""
+        self._socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        try:
+            self._socket.connect(self.socket_path)
+        except FileNotFoundError:
+            raise ConnectionError(f"Server not running at {self.socket_path}") from None
+        except Exception as e:
+            raise ConnectionError(f"Failed to connect: {e}") from e
+        # Send handshake (this IS lock acquisition)
+        request = HandshakeRequest(
+            lock_type=self._requested_lock_type, timeout_ms=timeout_ms
+        )
+        send_message_sync(self._socket, request)
+        # Receive response (may block waiting for lock)
+        response, _, self._recv_buffer = recv_message_sync(
+            self._socket, self._recv_buffer
+        )
+        if isinstance(response, ErrorResponse):
+            self._socket.close()
+            self._socket = None
+            raise ConnectionError(f"Handshake error: {response.error}")
+        if not isinstance(response, HandshakeResponse):
+            self._socket.close()
+            self._socket = None
+            raise ConnectionError(f"Unexpected response: {type(response)}")
+        if not response.success:
+            self._socket.close()
+            self._socket = None
+            raise TimeoutError("Timeout waiting for lock")
+        self._committed = response.committed
+        # Store granted lock type (may differ from requested for rw_or_ro mode)
+        if response.granted_lock_type is not None:
+            self._granted_lock_type = response.granted_lock_type
+        elif self._requested_lock_type == RequestedLockType.RW:
+            self._granted_lock_type = GrantedLockType.RW
+        else:
+            self._granted_lock_type = GrantedLockType.RO
+        logger.info(
+            f"Connected with {self._requested_lock_type.value} lock (granted={self._granted_lock_type.value}), "
+            f"committed={self._committed}"
+        )
+    @property
+    def committed(self) -> bool:
+        """Check if weights are committed (valid)."""
+        return self._committed
+    @property
+    def lock_type(self) -> Optional[GrantedLockType]:
+        """Get the lock type actually granted by the server.
+        For rw_or_ro mode, this tells you whether RW or RO was granted.
+        """
+        return self._granted_lock_type
+    @property
+    def is_connected(self) -> bool:
+        """Check if client is connected."""
+        return self._socket is not None
+    def _send_recv(self, request) -> Tuple[object, int]:
+        """Send request and receive response. Returns (response, fd)."""
+        if not self._socket:
+            raise RuntimeError("Client not connected")
+        send_message_sync(self._socket, request)
+        response, fd, self._recv_buffer = recv_message_sync(
+            self._socket, self._recv_buffer
+        )
+        if isinstance(response, ErrorResponse):
+            raise RuntimeError(f"Server error: {response.error}")
+        return response, fd
+    def _call(self, request, response_type: Type[T]) -> T:
+        """Send request, validate response type, return typed response."""
+        if type(request) in RW_REQUIRED and self.lock_type != GrantedLockType.RW:
+            raise RuntimeError("Operation requires RW connection")
+        response, _ = self._send_recv(request)
+        if not isinstance(response, response_type):
+            raise RuntimeError(f"Unexpected response: {type(response)}")
+        return response
+    def get_lock_state(self) -> GetLockStateResponse:
+        return self._call(GetLockStateRequest(), GetLockStateResponse)
+    def get_allocation_state(self) -> GetAllocationStateResponse:
+        return self._call(GetAllocationStateRequest(), GetAllocationStateResponse)
+    def is_ready(self) -> bool:
+        return self.committed
+    def commit(self) -> bool:
+        """Commit weights and release RW lock. Returns True on success."""
+        if CommitRequest in RW_REQUIRED and self.lock_type != GrantedLockType.RW:
+            raise RuntimeError("Operation requires RW connection")
+        try:
+            response, _ = self._send_recv(CommitRequest())
+            ok = isinstance(response, CommitResponse) and response.success
+        except (ConnectionResetError, BrokenPipeError, OSError) as e:
+            # Server closes RW socket as part of commit
+            logger.debug(
+                f"Commit saw socket error ({type(e).__name__}); verifying via RO connect"
+            )
+            self.close()
+            try:
+                ro = GMSRPCClient(
+                    self.socket_path, lock_type=RequestedLockType.RO, timeout_ms=1000
+                )
+                try:
+                    ok = ro.committed
+                finally:
+                    ro.close()
+            except TimeoutError:
+                ok = False
+        if ok:
+            self._committed = True
+            self.close()
+            logger.info("Committed weights and released RW connection")
+            return True
+        return False
+    def allocate(self, size: int, tag: str = "default") -> Tuple[str, int]:
+        """Returns (allocation_id, aligned_size)."""
+        r = self._call(AllocateRequest(size=size, tag=tag), AllocateResponse)
+        return r.allocation_id, r.aligned_size
+    def export(self, allocation_id: str) -> int:
+        """Export allocation as POSIX FD. Caller must close."""
+        _, fd = self._send_recv(ExportRequest(allocation_id=allocation_id))
+        if fd < 0:
+            raise RuntimeError("No FD received from server")
+        return fd
+    def get_allocation(self, allocation_id: str) -> GetAllocationResponse:
+        return self._call(
+            GetAllocationRequest(allocation_id=allocation_id), GetAllocationResponse
+        )
+    def list_allocations(self, tag: Optional[str] = None) -> List[Dict]:
+        return self._call(
+            ListAllocationsRequest(tag=tag), ListAllocationsResponse
+        ).allocations
+    def free(self, allocation_id: str) -> bool:
+        return self._call(
+            FreeRequest(allocation_id=allocation_id), FreeResponse
+        ).success
+    def clear_all(self) -> int:
+        return self._call(ClearAllRequest(), ClearAllResponse).cleared_count
+    def metadata_put(
+        self, key: str, allocation_id: str, offset_bytes: int, value: bytes
+    ) -> bool:
+        req = MetadataPutRequest(
+            key=key, allocation_id=allocation_id, offset_bytes=offset_bytes, value=value
+        )
+        return self._call(req, MetadataPutResponse).success
+    def metadata_get(self, key: str) -> Optional[tuple[str, int, bytes]]:
+        """Returns (allocation_id, offset_bytes, value) or None if not found."""
+        r = self._call(MetadataGetRequest(key=key), MetadataGetResponse)
+        return (r.allocation_id, r.offset_bytes, r.value) if r.found else None
+    def metadata_delete(self, key: str) -> bool:
+        return self._call(
+            MetadataDeleteRequest(key=key), MetadataDeleteResponse
+        ).deleted
+    def metadata_list(self, prefix: str = "") -> List[str]:
+        return self._call(MetadataListRequest(prefix=prefix), MetadataListResponse).keys
+    def get_memory_layout_hash(self) -> str:
+        """Get state hash (hash of allocations + metadata). Empty if not committed."""
+        return self._call(
+            GetStateHashRequest(), GetStateHashResponse
+        ).memory_layout_hash
+    def close(self) -> None:
+        """Close connection and release lock."""
+        if self._socket:
+            try:
+                self._socket.close()
+            except Exception:
+                pass
+            self._socket = None
+            lock_str = self.lock_type.value if self.lock_type else "unknown"
+            logger.info(f"Closed {lock_str} connection")
+    def __enter__(self) -> "GMSRPCClient":
+        """Context manager entry."""
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Context manager exit."""
+        self.close()
+    def __del__(self):
+        """Destructor: warn if connection not closed."""
+        if self._socket:
+            logger.warning("GMSRPCClient not closed properly")
--- a/lib/gpu_memory_service/client/torch/__init__.py
+++ b/lib/gpu_memory_service/client/torch/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""PyTorch integration for GPU Memory Service.
+This module provides PyTorch-specific functionality:
+- Memory manager singleton management
+- Tensor utilities (metadata, registration, materialization)
+- C++ extension for CUDAPluggableAllocator
+"""
+from gpu_memory_service.client.torch.allocator import (
+    get_gms_client_memory_manager,
+    get_or_create_gms_client_memory_manager,
+)
+from gpu_memory_service.client.torch.module import (
+    materialize_module_from_gms,
+    register_module_tensors,
+)
+# Public API of this package: the names imported above from the allocator and module submodules.
+__all__ = [
+    # GMS client memory manager
+    "get_or_create_gms_client_memory_manager",
+    "get_gms_client_memory_manager",
+    # Tensor operations (public API)
+    "register_module_tensors",
+    "materialize_module_from_gms",
+]