feat: GPU Memory Service (#5286)

Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>

feat: GPU Memory Service (#5286)
Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
30c6228b · Schwinn Saereesitthipitak · GitHub · cde3b2a5 · 30c6228b · 30c6228b
Unverified Commit 30c6228b authored Jan 23, 2026 by Schwinn Saereesitthipitak Committed by GitHub Jan 23, 2026
20 changed files
--- a/.devcontainer/post-create.sh
+++ b/.devcontainer/post-create.sh
@@ -111,7 +111,8 @@ $SANITY_STATUS
 Now build the project:
  cargo build --locked --profile dev --features dynamo-llm/block-manager
  cd lib/bindings/python && maturin develop --uv
-  DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e .
+  uv pip install -e lib/gpu_memory_service  # GPU memory manager with C++ extension
+  DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .
 Optional: cd lib/bindings/kvbm && maturin develop --uv  # For KVBM support

--- a/.dockerignore
+++ b/.dockerignore
@@ -45,6 +45,10 @@ container/Dockerfile*
 .venv
 .venv-docs
+# GPU Memory Service build artifacts
+lib/gpu_memory_service/build/
+lib/gpu_memory_service/*.egg-info/
+lib/gpu_memory_service/**/*.so
 # Python
 __pycache__/

--- a/.github/filters.yaml
+++ b/.github/filters.yaml
@@ -78,6 +78,7 @@ core:
  - 'components/src/dynamo/mocker/**'
  - 'components/src/dynamo/frontend/**'
  - 'components/src/dynamo/common/**'
+  - 'components/src/dynamo/gpu_memory_service/**'
  - '*.toml'
  - '*.lock'
  - '*.py'

--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
 tensorrtllm_engines/
 api_server_models/
 server/
+!lib/gpu_memory_service/server/
 # Replay/Snapshot test artifacts
 *.new
 lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/

--- a/README.md
+++ b/README.md
@@ -331,7 +331,16 @@ cd lib/bindings/python
 maturin develop --uv
 ```
-## 6. Install the Wheel
+## 6. Install GPU Memory Service
+The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).
+```bash
+cd $PROJECT_ROOT
+uv pip install -e lib/gpu_memory_service
+```
+## 7. Install the Wheel
 ```
 cd $PROJECT_ROOT

--- a/components/src/dynamo/gpu_memory_service/__init__.py
+++ b/components/src/dynamo/gpu_memory_service/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service component for Dynamo.
+This module provides the Dynamo component wrapper around the gpu_memory_service package.
+The core functionality is in the gpu_memory_service package; this module provides:
+- CLI entry point (python -m dynamo.gpu_memory_service)
+- Re-exports for backwards compatibility
+"""
+# Re-export core functionality from gpu_memory_service package
+from gpu_memory_service import (
+    GMSClientMemoryManager,
+    StaleMemoryLayoutError,
+    get_gms_client_memory_manager,
+    get_or_create_gms_client_memory_manager,
+)
+# Re-export extensions (built separately)
+try:
+    from gpu_memory_service.client.torch.extensions import _allocator_ext
+except (ImportError, OSError):
+    _allocator_ext = None
+# Re-export module utilities
+from gpu_memory_service.client.torch.module import (
+    materialize_module_from_gms,
+    register_module_tensors,
+)
+__all__ = [
+    # Core
+    "GMSClientMemoryManager",
+    "StaleMemoryLayoutError",
+    # GMS client memory manager
+    "get_or_create_gms_client_memory_manager",
+    "get_gms_client_memory_manager",
+    # Tensor utilities
+    "register_module_tensors",
+    "materialize_module_from_gms",
+    # Extensions
+    "_allocator_ext",
+]
--- a/components/src/dynamo/gpu_memory_service/__main__.py
+++ b/components/src/dynamo/gpu_memory_service/__main__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from dynamo.gpu_memory_service.server import main
+if __name__ == "__main__":
+    main()
--- a/components/src/dynamo/gpu_memory_service/args.py
+++ b/components/src/dynamo/gpu_memory_service/args.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Argument parsing for GPU Memory Service server component."""
+import argparse
+import logging
+from dataclasses import dataclass
+logger = logging.getLogger(__name__)
+@dataclass
+class Config:
+    """Configuration for GPU Memory Service server.
+    Instances are built by parse_args() from the command line.
+    """
+    # GPU Memory Service specific
+    # CUDA device ID to manage memory for (--device).
+    device: int
+    # Unix domain socket path, with any {device} placeholder already expanded (--socket-path).
+    socket_path: str
+    # True when -v/--verbose was passed.
+    verbose: bool
+def parse_args() -> Config:
+    """Parse command line arguments for GPU Memory Service server.
+    When --socket-path is omitted, the path defaults to
+    /tmp/gpu_memory_service_<device>.sock. When it is given, a {device}
+    placeholder in it is expanded with the value of --device.
+    Returns:
+        Config holding the device ID, the resolved socket path and the verbose flag.
+    Raises:
+        SystemExit: Raised by argparse for invalid arguments, including a
+            --socket-path template that cannot be expanded.
+    """
+    parser = argparse.ArgumentParser(
+        description="GPU Memory Service allocation server for Dynamo."
+    )
+    # GPU Memory Service specific arguments
+    parser.add_argument(
+        "--device",
+        type=int,
+        required=True,
+        help="CUDA device ID to manage memory for.",
+    )
+    parser.add_argument(
+        "--socket-path",
+        type=str,
+        default=None,
+        help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
+        "Supports {device} placeholder for multi-GPU setups.",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Enable verbose logging.",
+    )
+    args = parser.parse_args()
+    if args.socket_path is None:
+        # Generate default socket path if not provided
+        socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
+    else:
+        # Expand {device} placeholder. str.format() raises on any other
+        # placeholder or on unbalanced braces, so turn that into a regular
+        # usage error instead of an unhandled traceback.
+        try:
+            socket_path = args.socket_path.format(device=args.device)
+        except (KeyError, IndexError, ValueError) as exc:
+            # parser.error() prints the usage message and exits with status 2.
+            parser.error(
+                f"invalid --socket-path {args.socket_path!r}: only the {{device}} "
+                f"placeholder is supported ({exc!r})"
+            )
+    return Config(
+        device=args.device,
+        socket_path=socket_path,
+        verbose=args.verbose,
+    )
--- a/components/src/dynamo/gpu_memory_service/server.py
+++ b/components/src/dynamo/gpu_memory_service/server.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service allocation server component for Dynamo.
+This component wraps the GMSRPCServer from gpu_memory_service to manage
+GPU memory allocations with connection-based RW/RO locking.
+Workers connect via the socket path, which should be passed to vLLM/SGLang via:
+    --load-format gpu_memory_service
+    --model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
+Usage:
+    python -m dynamo.gpu_memory_service --device 0
+    python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
+"""
+import asyncio
+import logging
+import signal
+import uvloop
+from gpu_memory_service.server import GMSRPCServer
+from .args import parse_args
+# Configure the root logger when this module is imported. The level starts at
+# INFO; worker() raises it to DEBUG when --verbose is given.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+async def worker() -> None:
+    """Run the GPU Memory Service server until SIGTERM or SIGINT arrives.
+    Parses the command line, starts a GMSRPCServer on the configured socket
+    path, waits for a shutdown signal, and then stops the server.
+    """
+    cfg = parse_args()
+    if cfg.verbose:
+        # Verbose mode: raise the root logger (name None) and this component's logger to DEBUG
+        for logger_name in (None, "dynamo.gpu_memory_service"):
+            logging.getLogger(logger_name).setLevel(logging.DEBUG)
+    logger.info(f"Starting GPU Memory Service Server for device {cfg.device}")
+    logger.info(f"Socket path: {cfg.socket_path}")
+    gms_server = GMSRPCServer(cfg.socket_path, device=cfg.device)
+    # The signal callback only sets this event; teardown happens in the finally block below
+    stop_requested = asyncio.Event()
+    def _on_shutdown_signal():
+        logger.info("Received shutdown signal")
+        stop_requested.set()
+    running_loop = asyncio.get_running_loop()
+    running_loop.add_signal_handler(signal.SIGTERM, _on_shutdown_signal)
+    running_loop.add_signal_handler(signal.SIGINT, _on_shutdown_signal)
+    await gms_server.start()
+    logger.info("GPU Memory Service Server ready, waiting for connections...")
+    connect_hint = (
+        f"To connect vLLM workers, use: --load-format gpu_memory_service "
+        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{cfg.socket_path}"}}\''
+    )
+    logger.info(connect_hint)
+    try:
+        # Park here until one of the registered signals fires
+        await stop_requested.wait()
+    finally:
+        logger.info("Shutting down GPU Memory Service Server...")
+        await gms_server.stop()
+        logger.info("GPU Memory Service Server shutdown complete")
+def main() -> None:
+    """Entry point for GPU Memory Service server.
+    Installs uvloop and then runs worker() to completion under asyncio.run().
+    """
+    uvloop.install()
+    asyncio.run(worker())
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    main()
--- a/container/Dockerfile
+++ b/container/Dockerfile
@@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ##############################################
 ########## Runtime image ##############
 ##############################################
@@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \
 # Install dynamo wheels (runtime packages only, no test dependencies)
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 RUN uv pip install \
    /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
    /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
    /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$GMS_WHEEL"; \
+    fi && \
    if [ "$ENABLE_KVBM" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \
@@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
        --requirement /tmp/requirements.test.txt
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 RUN uv pip install \
    /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
    /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
    /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$GMS_WHEEL"; \
+    fi && \
    if [ "$ENABLE_KVBM" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \

--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ##################################
 ########## Runtime Image #########
 ##################################
@@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src
 ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
 # Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
+ARG ENABLE_GPU_MEMORY_SERVICE
 RUN --mount=type=bind,source=.,target=/mnt/local_src \
    pip install --no-cache-dir --break-system-packages \
        /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
        /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
        /opt/dynamo/wheelhouse/nixl/nixl*.whl \
-        sglang==${SGLANG_VERSION}
+        sglang==${SGLANG_VERSION} && \
+    if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; \
+    fi
 # Install common and test dependencies as root
 RUN --mount=type=bind,source=.,target=/mnt/local_src \

--- a/container/Dockerfile.trtllm
+++ b/container/Dockerfile.trtllm
@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -454,6 +455,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ##################################################
 ########## Framework Builder Stage ##############
 ##################################################
@@ -770,12 +778,21 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
 # Install dynamo, NIXL, and dynamo-specific dependencies
 # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
 RUN uv pip install \
      --no-cache \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install --no-cache "$GMS_WHEEL"; \
+    fi && \
    if [ "${ENABLE_KVBM}" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -41,6 +41,7 @@ ARG BASE_IMAGE_TAG
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 ARG ENABLE_MEDIA_NIXL
 ARG ENABLE_MEDIA_FFMPEG
 ARG CARGO_BUILD_JOBS
@@ -481,6 +482,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo"
+# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
+ARG ENABLE_GPU_MEMORY_SERVICE
+RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
+        source ${VIRTUAL_ENV}/bin/activate && \
+        uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
+    fi
 ########################################################
 ########## Framework Development Image ################
 ########################################################
@@ -605,6 +613,7 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin
 COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/
 COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
 COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
+COPY --from=dynamo_base /usr/local/cuda/lib64/stubs/ /usr/local/cuda/lib64/stubs/
 RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\
    ln -s /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\
    ln -s /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so
@@ -744,11 +753,20 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
 # Install dynamo, NIXL, and dynamo-specific dependencies
 # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
 ARG ENABLE_KVBM
+ARG ENABLE_GPU_MEMORY_SERVICE
 COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
 RUN uv pip install \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
+        GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
+        if [ -z "$GMS_WHEEL" ]; then \
+            echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$GMS_WHEEL"; \
+    fi && \
    if [ "${ENABLE_KVBM}" = "true" ]; then \
        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
        if [ -z "$KVBM_WHEEL" ]; then \
@@ -823,6 +841,7 @@ RUN cd /usr/local/lib && \
    ldconfig
 USER dynamo
 ARG DYNAMO_COMMIT_SHA
 ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA

--- a/container/build.sh
+++ b/container/build.sh
@@ -156,6 +156,10 @@ PUSH=""
 # or can be explicitly enabled via --enable-kvbm flag
 ENABLE_KVBM=false
+# GPU Memory Service - default disabled, enabled automatically for VLLM/SGLANG
+# or can be explicitly enabled via --enable-gpu-memory-service flag
+ENABLE_GPU_MEMORY_SERVICE=false
 # sccache configuration for S3
 USE_SCCACHE=""
 SCCACHE_BUCKET=""
@@ -343,6 +347,9 @@ get_options() {
        --enable-kvbm)
            ENABLE_KVBM=true
            ;;
+        --enable-gpu-memory-service)
+            ENABLE_GPU_MEMORY_SERVICE=true
+            ;;
        --enable-media-nixl)
            ENABLE_MEDIA_NIXL=true
            ;;
@@ -539,6 +546,7 @@ show_help() {
    echo "  [--release-build perform a release build]"
    echo "  [--make-efa Adds AWS EFA layer on top of the built image (works with any target)]"
    echo "  [--enable-kvbm Enables KVBM support in Python 3.12]"
+    echo "  [--enable-gpu-memory-service Enables GPU Memory Service support]"
    echo "  [--enable-media-nixl Enable media processing with NIXL support (default: true for frameworks, false for none)]"
    echo "  [--enable-media-ffmpeg Enable media processing with FFMPEG support (default: true for frameworks, false for none)]"
    echo "  [--use-sccache enable sccache for Rust/C/C++ compilation caching]"
@@ -831,6 +839,20 @@ if [[ ${ENABLE_KVBM} == "true" ]]; then
    BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} "
 fi
+# ENABLE_GPU_MEMORY_SERVICE: Used in Dockerfiles for gpu_memory_service wheel.
+#                            Dockerfile.trtllm also honors it, but it is only force-enabled for VLLM/SGLANG below.
+# Force GPU Memory Service to be enabled for VLLM and SGLANG frameworks
+if [[ $FRAMEWORK == "VLLM" ]] || [[ $FRAMEWORK == "SGLANG" ]]; then
+    echo "Forcing enable_gpu_memory_service to true in ${FRAMEWORK} image build"
+    ENABLE_GPU_MEMORY_SERVICE=true
+fi
+# For other frameworks, ENABLE_GPU_MEMORY_SERVICE defaults to false unless --enable-gpu-memory-service flag was provided
+if [[ ${ENABLE_GPU_MEMORY_SERVICE} == "true" ]]; then
+    echo "Enabling GPU Memory Service in the dynamo image"
+    BUILD_ARGS+=" --build-arg ENABLE_GPU_MEMORY_SERVICE=${ENABLE_GPU_MEMORY_SERVICE} "
+fi
 # ENABLE_MEDIA_NIXL: Enable media processing with NIXL support
 # Used in base Dockerfile for maturin build feature flag.
 # Can be explicitly overridden with --enable-media-nixl flag

--- a/lib/gpu_memory_service/__init__.py
+++ b/lib/gpu_memory_service/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service - out-of-process GPU memory manager.
+The GPU Memory Service decouples ownership of GPU memory from the processes
+that use it, enabling zero-copy sharing and data survival across process crashes.
+Package structure:
+- common/: Shared types and protocol (used by both server and client)
+- server/: Allocation server daemon (no CUDA context required)
+- client/: Client library for memory management
+  - client/torch/: PyTorch integration (allocator, tensor, module, extensions)
+Primary client API:
+    from gpu_memory_service import (
+        GMSClientMemoryManager,
+        get_or_create_gms_client_memory_manager,
+        get_gms_client_memory_manager,
+    )
+Server API:
+    from gpu_memory_service.server import GMSRPCServer
+"""
+# Primary client exports
+from gpu_memory_service.client.memory_manager import (
+    GMSClientMemoryManager,
+    StaleMemoryLayoutError,
+)
+# PyTorch integration (GMS client memory manager)
+from gpu_memory_service.client.torch.allocator import (
+    get_gms_client_memory_manager,
+    get_or_create_gms_client_memory_manager,
+)
+__all__ = [
+    # Client
+    "GMSClientMemoryManager",
+    "StaleMemoryLayoutError",
+    # GMS client memory manager
+    "get_or_create_gms_client_memory_manager",
+    "get_gms_client_memory_manager",
+]
--- a/lib/gpu_memory_service/client/__init__.py
+++ b/lib/gpu_memory_service/client/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service client library.
+This module provides the client-side components for interacting with the
+GPU Memory Service:
+- GMSClientMemoryManager: Manages local VA mappings of remote GPU memory
+- GMSRPCClient: Low-level RPC client (pure Python, no PyTorch dependency)
+For PyTorch integration (MemPool, tensor utilities), see gpu_memory_service.client.torch.
+"""
+from gpu_memory_service.client.memory_manager import (
+    GMSClientMemoryManager,
+    StaleMemoryLayoutError,
+)
+from gpu_memory_service.client.rpc import GMSRPCClient
+__all__ = [
+    "GMSClientMemoryManager",
+    "StaleMemoryLayoutError",
+    "GMSRPCClient",
+]
--- a/lib/gpu_memory_service/client/cuda_vmm_utils.py
+++ b/lib/gpu_memory_service/client/cuda_vmm_utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Client-side CUDA VMM utilities.
+These functions wrap CUDA driver API calls used by the client memory manager
+for importing, mapping, and unmapping GPU memory.
+"""
+from __future__ import annotations
+from cuda.bindings import driver as cuda
+from gpu_memory_service.common.cuda_vmm_utils import check_cuda_result
+from gpu_memory_service.common.types import GrantedLockType
+def import_handle_from_fd(fd: int) -> int:
+    """Turn a POSIX file descriptor into a CUDA memory handle.
+    Args:
+        fd: File descriptor of a shareable allocation (received via SCM_RIGHTS).
+    Returns:
+        The imported CUDA memory handle, converted to int.
+    """
+    fd_handle_type = (
+        cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+    )
+    status, imported = cuda.cuMemImportFromShareableHandle(fd, fd_handle_type)
+    # check_cuda_result is defined in common.cuda_vmm_utils; presumably it raises on failure
+    check_cuda_result(status, "cuMemImportFromShareableHandle")
+    return int(imported)
+def reserve_va(size: int, granularity: int) -> int:
+    """Reserve a range of virtual address space.
+    Args:
+        size: Number of bytes to reserve (should be aligned to granularity).
+        granularity: VMM allocation granularity, passed as the alignment.
+    Returns:
+        Start address of the reserved range, converted to int.
+    """
+    fixed_addr = 0  # no specific start address requested
+    flags = 0
+    status, base = cuda.cuMemAddressReserve(size, granularity, fixed_addr, flags)
+    check_cuda_result(status, "cuMemAddressReserve")
+    return int(base)
+def free_va(va: int, size: int) -> None:
+    """Give back a virtual address reservation.
+    Args:
+        va: Start address of the reservation.
+        size: Size of the reservation in bytes.
+    """
+    # The binding returns a one-element tuple holding only the status code
+    [status] = cuda.cuMemAddressFree(va, size)
+    check_cuda_result(status, "cuMemAddressFree")
+def map_to_va(va: int, size: int, handle: int) -> None:
+    """Back a reserved virtual address range with a CUDA memory handle.
+    Args:
+        va: Virtual address (must already be reserved).
+        size: Number of bytes to map.
+        handle: CUDA memory handle to map at va.
+    """
+    offset = 0  # map starting from the beginning of the allocation
+    flags = 0
+    [status] = cuda.cuMemMap(va, size, offset, handle, flags)
+    check_cuda_result(status, "cuMemMap")
+def set_access(va: int, size: int, device: int, access: GrantedLockType) -> None:
+    """Apply access permissions to a mapped region for one device.
+    Args:
+        va: Start address of the mapped region.
+        size: Size of the region in bytes.
+        device: CUDA device index the permission applies to.
+        access: GrantedLockType.RO selects read-only protection; any other
+            value selects read-write.
+    """
+    desc = cuda.CUmemAccessDesc()
+    desc.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+    desc.location.id = device
+    if access == GrantedLockType.RO:
+        desc.flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READ
+    else:
+        desc.flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+    # A single descriptor is passed, hence the count of 1
+    [status] = cuda.cuMemSetAccess(va, size, [desc], 1)
+    check_cuda_result(status, "cuMemSetAccess")
+def unmap(va: int, size: int) -> None:
+    """Remove the mapping from a virtual address range.
+    Args:
+        va: Start address of the mapped range.
+        size: Size of the mapping in bytes.
+    """
+    [status] = cuda.cuMemUnmap(va, size)
+    check_cuda_result(status, "cuMemUnmap")
+def release_handle(handle: int) -> None:
+    """Drop a CUDA memory handle.
+    Args:
+        handle: CUDA memory handle to pass to cuMemRelease.
+    """
+    [status] = cuda.cuMemRelease(handle)
+    check_cuda_result(status, "cuMemRelease")
--- a/lib/gpu_memory_service/client/memory_manager.py
+++ b/lib/gpu_memory_service/client/memory_manager.py
--- a/lib/gpu_memory_service/client/rpc.py
+++ b/lib/gpu_memory_service/client/rpc.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""GPU Memory Service RPC Client.
+Low-level RPC client stub. The client provides a simple interface for acquiring
+locks and performing allocation operations. The socket connection IS the lock.
+This module has NO PyTorch dependency.
+Usage:
+    # Writer (acquires RW lock in constructor)
+    with GMSRPCClient(socket_path, lock_type=RequestedLockType.RW) as client:
+        alloc_id, aligned_size = client.allocate(size=1024*1024)
+        fd = client.export(alloc_id)
+        # ... write weights using fd ...
+        client.commit()
+    # Lock released on exit
+    # Reader (acquires RO lock in constructor)
+    client = GMSRPCClient(socket_path, lock_type=RequestedLockType.RO)
+    if client.committed:  # Check if weights are valid
+        allocations = client.list_allocations()
+        for alloc in allocations:
+            fd = client.export(alloc["allocation_id"])
+            # ... import and map fd ...
+    # Keep connection open during inference!
+    # client.close() only when done with inference
+"""
+import logging
+import socket
+from typing import Dict, List, Optional, Tuple, Type, TypeVar
+from gpu_memory_service.common.protocol.messages import (
+    AllocateRequest,
+    AllocateResponse,
+    ClearAllRequest,
+    ClearAllResponse,
+    CommitRequest,
+    CommitResponse,
+    ErrorResponse,
+    ExportRequest,
+    FreeRequest,
+    FreeResponse,
+    GetAllocationRequest,
+    GetAllocationResponse,
+    GetAllocationStateRequest,
+    GetAllocationStateResponse,
+    GetLockStateRequest,
+    GetLockStateResponse,
+    GetStateHashRequest,
+    GetStateHashResponse,
+    HandshakeRequest,
+    HandshakeResponse,
+    ListAllocationsRequest,
+    ListAllocationsResponse,
+    MetadataDeleteRequest,
+    MetadataDeleteResponse,
+    MetadataGetRequest,
+    MetadataGetResponse,
+    MetadataListRequest,
+    MetadataListResponse,
+    MetadataPutRequest,
+    MetadataPutResponse,
+)
+from gpu_memory_service.common.protocol.wire import recv_message_sync, send_message_sync
+from gpu_memory_service.common.types import (
+    RW_REQUIRED,
+    GrantedLockType,
+    RequestedLockType,
+)
+T = TypeVar("T")
+logger = logging.getLogger(__name__)
+class GMSRPCClient:
+    """GPU Memory Service RPC Client.
+    CRITICAL: Socket connection IS the lock.
+    - Constructor blocks until lock is acquired
+    - close() releases the lock
+    - committed property tells readers if weights are valid
+    For writers (lock_type=RequestedLockType.RW):
+        - Use context manager (with statement) for automatic lock release
+        - Call commit() after weights are written
+        - Call clear_all() before loading new model
+    For readers (lock_type=RequestedLockType.RO):
+        - Check committed property after construction
+        - Keep connection open during inference lifetime
+        - Only call close() when shutting down or allowing weight updates
+    """
+    def __init__(
+        self,
+        socket_path: str,
+        lock_type: RequestedLockType = RequestedLockType.RO,
+        timeout_ms: Optional[int] = None,
+    ):
+        """Open the connection to the server and block until the lock is held.
+        Args:
+            socket_path: Filesystem path of the server's Unix domain socket.
+            lock_type: Lock type to request (RW, RO, or RW_OR_RO).
+            timeout_ms: How long to wait for the lock, in milliseconds.
+                        None means wait indefinitely.
+        Raises:
+            ConnectionError: If the connection or handshake fails.
+            TimeoutError: If the lock is not granted within timeout_ms.
+        """
+        # What the caller asked for
+        self.socket_path = socket_path
+        self._requested_lock_type = lock_type
+        # Handshake results; overwritten by _connect()
+        self._granted_lock_type: Optional[GrantedLockType] = None
+        self._committed = False
+        # Transport state
+        self._recv_buffer = bytearray()
+        self._socket: Optional[socket.socket] = None
+        # Connecting performs the handshake, which is the lock acquisition
+        self._connect(timeout_ms=timeout_ms)
+    def _connect(self, timeout_ms: Optional[int]) -> None:
+        """Connect to server and perform handshake (lock acquisition).
+        The handshake response may not arrive until the server grants the lock,
+        so this call can block. On every failure path the socket is closed and
+        self._socket is reset to None before the exception propagates, so a
+        failed attempt never leaves an open connection behind.
+        Args:
+            timeout_ms: Lock acquisition timeout in milliseconds, sent to the
+                server in the handshake request. None means wait indefinitely.
+        Raises:
+            ConnectionError: If the socket cannot be connected, the server
+                replies with an ErrorResponse, or the reply is not a
+                HandshakeResponse.
+            TimeoutError: If the handshake response reports success=False.
+        """
+        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        self._socket = sock
+        try:
+            try:
+                sock.connect(self.socket_path)
+            except FileNotFoundError:
+                raise ConnectionError(
+                    f"Server not running at {self.socket_path}"
+                ) from None
+            except Exception as e:
+                raise ConnectionError(f"Failed to connect: {e}") from e
+            # Send handshake (this IS lock acquisition)
+            request = HandshakeRequest(
+                lock_type=self._requested_lock_type, timeout_ms=timeout_ms
+            )
+            send_message_sync(sock, request)
+            # Receive response (may block waiting for lock)
+            response, _, self._recv_buffer = recv_message_sync(
+                sock, self._recv_buffer
+            )
+            if isinstance(response, ErrorResponse):
+                raise ConnectionError(f"Handshake error: {response.error}")
+            if not isinstance(response, HandshakeResponse):
+                raise ConnectionError(f"Unexpected response: {type(response)}")
+            if not response.success:
+                raise TimeoutError("Timeout waiting for lock")
+        except BaseException:
+            # The connection is the lock, so it must not outlive a failed or
+            # interrupted handshake. BaseException also covers Ctrl-C while
+            # blocked waiting for the lock; the exception is always re-raised.
+            self._socket = None
+            sock.close()
+            raise
+        self._committed = response.committed
+        # Store granted lock type (may differ from requested for rw_or_ro mode)
+        if response.granted_lock_type is not None:
+            self._granted_lock_type = response.granted_lock_type
+        elif self._requested_lock_type == RequestedLockType.RW:
+            # Response carries no granted type: fall back to what was requested
+            self._granted_lock_type = GrantedLockType.RW
+        else:
+            self._granted_lock_type = GrantedLockType.RO
+        logger.info(
+            f"Connected with {self._requested_lock_type.value} lock (granted={self._granted_lock_type.value}), "
+            f"committed={self._committed}"
+        )
+    @property
+    def committed(self) -> bool:
+        """Return the cached committed flag.
+        The flag is taken from the handshake response and set to True after a
+        successful ``commit()``; reading it performs no server round-trip.
+        """
+        is_committed = self._committed
+        return is_committed
+    @property
+    def lock_type(self) -> Optional[GrantedLockType]:
+        """Return the lock type the server actually granted.
+        In rw_or_ro mode this is how callers learn whether RW or RO was
+        granted. The value is None until a handshake has succeeded.
+        """
+        granted = self._granted_lock_type
+        return granted
+    @property
+    def is_connected(self) -> bool:
+        """Return True while this client still holds a socket object."""
+        has_socket = self._socket is not None
+        return has_socket
+    def _send_recv(self, request) -> Tuple[object, int]:
+        """Perform one request/response round-trip on the open socket.
+        Returns:
+            ``(response, fd)`` exactly as produced by ``recv_message_sync``.
+        Raises:
+            RuntimeError: If the client has no socket, or the server replied
+                with an ErrorResponse.
+        """
+        sock = self._socket
+        if not sock:
+            raise RuntimeError("Client not connected")
+        send_message_sync(sock, request)
+        # The receive helper hands back the unconsumed buffer; keep it for
+        # the next call.
+        reply, fd, self._recv_buffer = recv_message_sync(sock, self._recv_buffer)
+        if not isinstance(reply, ErrorResponse):
+            return reply, fd
+        raise RuntimeError(f"Server error: {reply.error}")
+    def _call(self, request, response_type: Type[T]) -> T:
+        """Send ``request`` and return the reply, checked against ``response_type``.
+        Raises:
+            RuntimeError: If the request type is listed in RW_REQUIRED but the
+                granted lock is not RW, if the server returns an error, or if
+                the reply is not an instance of ``response_type``.
+        """
+        needs_rw = type(request) in RW_REQUIRED
+        if needs_rw and self.lock_type != GrantedLockType.RW:
+            raise RuntimeError("Operation requires RW connection")
+        reply, _fd = self._send_recv(request)
+        if isinstance(reply, response_type):
+            return reply
+        raise RuntimeError(f"Unexpected response: {type(reply)}")
+    def get_lock_state(self) -> GetLockStateResponse:
+        """Send a GetLockStateRequest and return the server's typed reply."""
+        request = GetLockStateRequest()
+        return self._call(request, GetLockStateResponse)
+    def get_allocation_state(self) -> GetAllocationStateResponse:
+        """Send a GetAllocationStateRequest and return the server's typed reply."""
+        request = GetAllocationStateRequest()
+        return self._call(request, GetAllocationStateResponse)
+    def is_ready(self) -> bool:
+        """Return the cached committed flag (method form of ``committed``)."""
+        ready = self.committed
+        return ready
+    def commit(self) -> bool:
+        """Commit weights and give up the RW connection.
+        Returns:
+            True when the commit is confirmed (the connection is then closed),
+            False otherwise.
+        Raises:
+            RuntimeError: If CommitRequest is listed in RW_REQUIRED and the
+                granted lock is not RW, or if the server returns an error.
+        """
+        if CommitRequest in RW_REQUIRED and self.lock_type != GrantedLockType.RW:
+            raise RuntimeError("Operation requires RW connection")
+        try:
+            reply, _fd = self._send_recv(CommitRequest())
+        except OSError as err:
+            # ConnectionResetError and BrokenPipeError are OSError subclasses.
+            # Server closes RW socket as part of commit, so a socket error
+            # here is ambiguous: reconnect read-only and check the flag.
+            logger.debug(
+                f"Commit saw socket error ({type(err).__name__}); verifying via RO connect"
+            )
+            self.close()
+            try:
+                with GMSRPCClient(
+                    self.socket_path, lock_type=RequestedLockType.RO, timeout_ms=1000
+                ) as verifier:
+                    succeeded = verifier.committed
+            except TimeoutError:
+                succeeded = False
+        else:
+            succeeded = isinstance(reply, CommitResponse) and reply.success
+        if not succeeded:
+            return False
+        self._committed = True
+        self.close()
+        logger.info("Committed weights and released RW connection")
+        return True
+    def allocate(self, size: int, tag: str = "default") -> Tuple[str, int]:
+        """Request an allocation of ``size`` under ``tag``.
+        Returns:
+            ``(allocation_id, aligned_size)`` taken from the AllocateResponse.
+        """
+        request = AllocateRequest(size=size, tag=tag)
+        reply = self._call(request, AllocateResponse)
+        return (reply.allocation_id, reply.aligned_size)
+    def export(self, allocation_id: str) -> int:
+        """Export an allocation as a POSIX FD; the caller must close it.
+        Raises:
+            RuntimeError: If the reply carried no file descriptor (fd < 0) or
+                the server returned an error.
+        """
+        request = ExportRequest(allocation_id=allocation_id)
+        _reply, fd = self._send_recv(request)
+        if fd >= 0:
+            return fd
+        raise RuntimeError("No FD received from server")
+    def get_allocation(self, allocation_id: str) -> GetAllocationResponse:
+        """Send a GetAllocationRequest for ``allocation_id`` and return the typed reply."""
+        request = GetAllocationRequest(allocation_id=allocation_id)
+        return self._call(request, GetAllocationResponse)
+    def list_allocations(self, tag: Optional[str] = None) -> List[Dict]:
+        """Return the ``allocations`` field of the server's reply for ``tag``."""
+        request = ListAllocationsRequest(tag=tag)
+        reply = self._call(request, ListAllocationsResponse)
+        return reply.allocations
+    def free(self, allocation_id: str) -> bool:
+        """Send a FreeRequest and return the reply's ``success`` flag."""
+        request = FreeRequest(allocation_id=allocation_id)
+        reply = self._call(request, FreeResponse)
+        return reply.success
+    def clear_all(self) -> int:
+        """Send a ClearAllRequest and return the reply's ``cleared_count``."""
+        reply = self._call(ClearAllRequest(), ClearAllResponse)
+        return reply.cleared_count
+    def metadata_put(
+        self, key: str, allocation_id: str, offset_bytes: int, value: bytes
+    ) -> bool:
+        """Send a MetadataPutRequest and return the reply's ``success`` flag."""
+        reply = self._call(
+            MetadataPutRequest(
+                key=key,
+                allocation_id=allocation_id,
+                offset_bytes=offset_bytes,
+                value=value,
+            ),
+            MetadataPutResponse,
+        )
+        return reply.success
+    def metadata_get(self, key: str) -> Optional[tuple[str, int, bytes]]:
+        """Look up the metadata entry stored under ``key``.
+        Returns:
+            ``(allocation_id, offset_bytes, value)``, or None when the reply's
+            ``found`` flag is false.
+        """
+        reply = self._call(MetadataGetRequest(key=key), MetadataGetResponse)
+        if not reply.found:
+            return None
+        return (reply.allocation_id, reply.offset_bytes, reply.value)
+    def metadata_delete(self, key: str) -> bool:
+        """Send a MetadataDeleteRequest and return the reply's ``deleted`` flag."""
+        request = MetadataDeleteRequest(key=key)
+        reply = self._call(request, MetadataDeleteResponse)
+        return reply.deleted
+    def metadata_list(self, prefix: str = "") -> List[str]:
+        """Return the ``keys`` field of the server's reply for ``prefix``."""
+        request = MetadataListRequest(prefix=prefix)
+        reply = self._call(request, MetadataListResponse)
+        return reply.keys
+    def get_memory_layout_hash(self) -> str:
+        """Return the ``memory_layout_hash`` field of the GetStateHashResponse.
+        The server is expected to hash allocations + metadata and to return
+        an empty value when nothing is committed; that logic is server-side
+        and not visible from this client.
+        """
+        reply = self._call(GetStateHashRequest(), GetStateHashResponse)
+        return reply.memory_layout_hash
+    def close(self) -> None:
+        """Close the connection if one is open; calling it again is a no-op.
+        Errors raised while closing the socket are deliberately ignored
+        (best-effort teardown).
+        """
+        sock = self._socket
+        if not sock:
+            return
+        try:
+            sock.close()
+        except Exception:
+            pass
+        self._socket = None
+        granted = self.lock_type
+        lock_str = granted.value if granted else "unknown"
+        logger.info(f"Closed {lock_str} connection")
+    def __enter__(self) -> "GMSRPCClient":
+        """Enter the ``with`` block; the client itself is the managed resource."""
+        client = self
+        return client
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Leave the ``with`` block: close the connection.
+        Returns None, so an exception raised inside the block propagates.
+        """
+        self.close()
+        return None
+    def __del__(self):
+        """Destructor: warn if connection not closed.
+        Only logs; it does not close the socket itself.
+        """
+        # getattr, not attribute access: the finalizer also runs on instances
+        # whose __init__ never assigned _socket (e.g. the constructor was
+        # called with bad arguments), and must not raise AttributeError there.
+        if getattr(self, "_socket", None):
+            logger.warning("GMSRPCClient not closed properly")
--- a/lib/gpu_memory_service/client/torch/__init__.py
+++ b/lib/gpu_memory_service/client/torch/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""PyTorch integration for GPU Memory Service.
+This module provides PyTorch-specific functionality:
+- Memory manager singleton management
+- Tensor utilities (metadata, registration, materialization)
+- C++ extension for CUDAPluggableAllocator
+"""
+from gpu_memory_service.client.torch.allocator import (
+    get_gms_client_memory_manager,
+    get_or_create_gms_client_memory_manager,
+)
+from gpu_memory_service.client.torch.module import (
+    materialize_module_from_gms,
+    register_module_tensors,
+)
+# Public names of this package; each one is re-exported from the imports above.
+__all__ = [
+    # GMS client memory manager (from .allocator)
+    "get_or_create_gms_client_memory_manager",
+    "get_gms_client_memory_manager",
+    # Module tensor registration / materialization (from .module)
+    "register_module_tensors",
+    "materialize_module_from_gms",
+]