"lib/kv-router/src/vscode:/vscode.git/clone" did not exist on "b35db6e2cacee540854bbe5289f23ed0c3a2b5d7"
Unverified commit 30c6228b, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

feat: GPU Memory Service (#5286)


Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
parent cde3b2a5
...@@ -111,7 +111,8 @@ $SANITY_STATUS ...@@ -111,7 +111,8 @@ $SANITY_STATUS
Now build the project: Now build the project:
cargo build --locked --profile dev --features dynamo-llm/block-manager cargo build --locked --profile dev --features dynamo-llm/block-manager
cd lib/bindings/python && maturin develop --uv cd lib/bindings/python && maturin develop --uv
DYNAMO_BIN_PATH=$CARGO_TARGET_DIR/debug uv pip install -e . uv pip install -e lib/gpu_memory_service # GPU memory manager with C++ extension
DYNAMO_BIN_PATH=\$CARGO_TARGET_DIR/debug uv pip install -e .
Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support Optional: cd lib/bindings/kvbm && maturin develop --uv # For KVBM support
......
...@@ -45,6 +45,10 @@ container/Dockerfile* ...@@ -45,6 +45,10 @@ container/Dockerfile*
.venv .venv
.venv-docs .venv-docs
# GPU Memory Service build artifacts
lib/gpu_memory_service/build/
lib/gpu_memory_service/*.egg-info/
lib/gpu_memory_service/**/*.so
# Python # Python
__pycache__/ __pycache__/
......
...@@ -78,6 +78,7 @@ core: ...@@ -78,6 +78,7 @@ core:
- 'components/src/dynamo/mocker/**' - 'components/src/dynamo/mocker/**'
- 'components/src/dynamo/frontend/**' - 'components/src/dynamo/frontend/**'
- 'components/src/dynamo/common/**' - 'components/src/dynamo/common/**'
- 'components/src/dynamo/gpu_memory_service/**'
- '*.toml' - '*.toml'
- '*.lock' - '*.lock'
- '*.py' - '*.py'
......
...@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/ ...@@ -57,6 +57,7 @@ tensorrtllm_checkpoints/
tensorrtllm_engines/ tensorrtllm_engines/
api_server_models/ api_server_models/
server/ server/
!lib/gpu_memory_service/server/
# Replay/Snapshot test artifacts # Replay/Snapshot test artifacts
*.new *.new
lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/ lib/llm/tests/data/sample-models/models--meta-llama--Llama-3.1-70B-Instruct/
......
...@@ -331,7 +331,16 @@ cd lib/bindings/python ...@@ -331,7 +331,16 @@ cd lib/bindings/python
maturin develop --uv maturin develop --uv
``` ```
## 6. Install the Wheel ## 6. Install GPU Memory Service
The GPU Memory Service is a Python package with a C++ extension. It requires only Python development headers and a C++ compiler (g++).
```bash
cd $PROJECT_ROOT
uv pip install -e lib/gpu_memory_service
```
## 7. Install the Wheel
``` ```
cd $PROJECT_ROOT cd $PROJECT_ROOT
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service component for Dynamo.
This module provides the Dynamo component wrapper around the gpu_memory_service package.
The core functionality is in the gpu_memory_service package; this module provides:
- CLI entry point (python -m dynamo.gpu_memory_service)
- Re-exports for backwards compatibility
"""
# Re-export core functionality from gpu_memory_service package
from gpu_memory_service import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
# Optional extension module, built separately from the pure-Python package.
# When importing it raises ImportError or OSError, expose None instead so that
# importing this package still succeeds.
try:
    from gpu_memory_service.client.torch.extensions import _allocator_ext
except (ImportError, OSError):
    _allocator_ext = None
# Re-export module utilities
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)
# Names re-exported by this wrapper package.
__all__ = [
    # Core client types
    "GMSClientMemoryManager",
    "StaleMemoryLayoutError",
    # Accessors for the GMS client memory manager
    "get_or_create_gms_client_memory_manager",
    "get_gms_client_memory_manager",
    # Module/tensor helpers
    "register_module_tensors",
    "materialize_module_from_gms",
    # Optional extension module (may be None)
    "_allocator_ext",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.gpu_memory_service.server import main
# Start the server only when this module is executed, not when it is imported.
if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Argument parsing for GPU Memory Service server component."""
import argparse
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class Config:
    """Settings for the GPU Memory Service server, as resolved from the CLI.

    Attributes:
        device: CUDA device ID to manage memory for.
        socket_path: Path of the Unix domain socket.
        verbose: Whether verbose logging was requested.
    """

    device: int
    socket_path: str
    verbose: bool
def parse_args() -> Config:
"""Parse command line arguments for GPU Memory Service server."""
parser = argparse.ArgumentParser(
description="GPU Memory Service allocation server for Dynamo."
)
# GPU Memory Service specific arguments
parser.add_argument(
"--device",
type=int,
required=True,
help="CUDA device ID to manage memory for.",
)
parser.add_argument(
"--socket-path",
type=str,
default=None,
help="Path for Unix domain socket. Default: /tmp/gpu_memory_service_{device}.sock. "
"Supports {device} placeholder for multi-GPU setups.",
)
parser.add_argument(
"--verbose",
"-v",
action="store_true",
help="Enable verbose logging.",
)
args = parser.parse_args()
# Generate default socket path if not provided
socket_path = args.socket_path
if socket_path is None:
socket_path = f"/tmp/gpu_memory_service_{args.device}.sock"
else:
# Expand {device} placeholder
socket_path = socket_path.format(device=args.device)
config = Config(
device=args.device,
socket_path=socket_path,
verbose=args.verbose,
)
return config
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service allocation server component for Dynamo.
This component wraps the GMSRPCServer from gpu_memory_service to manage
GPU memory allocations with connection-based RW/RO locking.
Workers connect via the socket path, which should be passed to vLLM/SGLang via:
--load-format gpu_memory_service
--model-loader-extra-config '{"gpu_memory_service_socket_path": "/tmp/gpu_memory_service_{device}.sock"}'
Usage:
python -m dynamo.gpu_memory_service --device 0
python -m dynamo.gpu_memory_service --device 0 --socket-path /tmp/gpu_memory_service_{device}.sock
"""
import asyncio
import logging
import signal
import uvloop
from gpu_memory_service.server import GMSRPCServer
from .args import parse_args
# Logger named after this module.
logger = logging.getLogger(__name__)

# Configure logging when the module is imported: INFO level, with each record
# rendered as "time - logger name - level - message".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
async def worker() -> None:
    """Run the GPU Memory Service server until a shutdown signal arrives.

    Parses the command line, starts a GMSRPCServer on the configured socket
    path, waits for SIGTERM or SIGINT, and then stops the server.
    """
    config = parse_args()

    # --verbose switches the root logger and this package's logger to DEBUG.
    if config.verbose:
        for logger_name in (None, "dynamo.gpu_memory_service"):
            logging.getLogger(logger_name).setLevel(logging.DEBUG)

    logger.info(f"Starting GPU Memory Service Server for device {config.device}")
    logger.info(f"Socket path: {config.socket_path}")

    server = GMSRPCServer(config.socket_path, device=config.device)

    # The signal callback sets this event; the coroutine sleeps on it below.
    stop_requested = asyncio.Event()

    def _request_stop():
        logger.info("Received shutdown signal")
        stop_requested.set()

    running_loop = asyncio.get_running_loop()
    running_loop.add_signal_handler(signal.SIGTERM, _request_stop)
    running_loop.add_signal_handler(signal.SIGINT, _request_stop)

    await server.start()

    logger.info("GPU Memory Service Server ready, waiting for connections...")
    logger.info(
        f"To connect vLLM workers, use: --load-format gpu_memory_service "
        f'--model-loader-extra-config \'{{"gpu_memory_service_socket_path": "{config.socket_path}"}}\''
    )

    # Block until a signal arrives; the finally clause stops the server even
    # if the wait is interrupted by an exception.
    try:
        await stop_requested.wait()
    finally:
        logger.info("Shutting down GPU Memory Service Server...")
        await server.stop()
        logger.info("GPU Memory Service Server shutdown complete")
def main() -> None:
    """Entry point: call ``uvloop.install()`` and run ``worker()`` to completion."""
    uvloop.install()
    server_coro = worker()
    asyncio.run(server_coro)
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
...@@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen ...@@ -27,6 +27,7 @@ ARG EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inferen
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -431,6 +432,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
############################################## ##############################################
########## Runtime image ############## ########## Runtime image ##############
############################################## ##############################################
...@@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \ ...@@ -502,10 +510,19 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \
# Install dynamo wheels (runtime packages only, no test dependencies) # Install dynamo wheels (runtime packages only, no test dependencies)
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \ RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \ if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
...@@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi ...@@ -593,10 +610,19 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--requirement /tmp/requirements.test.txt --requirement /tmp/requirements.test.txt
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
RUN uv pip install \ RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "$ENABLE_KVBM" = "true" ]; then \ if [ "$ENABLE_KVBM" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
......
...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG ...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -442,6 +443,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
################################## ##################################
########## Runtime Image ######### ########## Runtime Image #########
################################## ##################################
...@@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src ...@@ -500,12 +508,21 @@ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /workspace/nixl/build/src
ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}" ENV SGLANG_VERSION="${RUNTIME_IMAGE_TAG%%-*}"
# Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages) # Install packages as root to ensure they go to system location (/usr/local/lib/python3.12/dist-packages)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN --mount=type=bind,source=.,target=/mnt/local_src \ RUN --mount=type=bind,source=.,target=/mnt/local_src \
pip install --no-cache-dir --break-system-packages \ pip install --no-cache-dir --break-system-packages \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \
sglang==${SGLANG_VERSION} sglang==${SGLANG_VERSION} && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
pip install --no-cache-dir --break-system-packages "$GMS_WHEEL"; \
fi
# Install common and test dependencies as root # Install common and test dependencies as root
RUN --mount=type=bind,source=.,target=/mnt/local_src \ RUN --mount=type=bind,source=.,target=/mnt/local_src \
......
...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG ...@@ -36,6 +36,7 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -454,6 +455,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -454,6 +455,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
################################################## ##################################################
########## Framework Builder Stage ############## ########## Framework Builder Stage ##############
################################################## ##################################################
...@@ -770,12 +778,21 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/ ...@@ -770,12 +778,21 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
# Install dynamo, NIXL, and dynamo-specific dependencies # Install dynamo, NIXL, and dynamo-specific dependencies
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path> # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
RUN uv pip install \ RUN uv pip install \
--no-cache \ --no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install --no-cache "$GMS_WHEEL"; \
fi && \
if [ "${ENABLE_KVBM}" = "true" ]; then \ if [ "${ENABLE_KVBM}" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
......
...@@ -41,6 +41,7 @@ ARG BASE_IMAGE_TAG ...@@ -41,6 +41,7 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
ARG ENABLE_MEDIA_NIXL ARG ENABLE_MEDIA_NIXL
ARG ENABLE_MEDIA_FFMPEG ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
...@@ -481,6 +482,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -481,6 +482,13 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
# Build gpu_memory_service wheel (C++ extension only needs Python headers, no CUDA/torch)
ARG ENABLE_GPU_MEMORY_SERVICE
RUN if [ "$ENABLE_GPU_MEMORY_SERVICE" = "true" ]; then \
source ${VIRTUAL_ENV}/bin/activate && \
uv build --wheel --out-dir /opt/dynamo/dist /opt/dynamo/lib/gpu_memory_service; \
fi
######################################################## ########################################################
########## Framework Development Image ################ ########## Framework Development Image ################
######################################################## ########################################################
...@@ -605,6 +613,7 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin ...@@ -605,6 +613,7 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin
COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/ COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/ COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
COPY --from=dynamo_base /usr/local/cuda/lib64/stubs/ /usr/local/cuda/lib64/stubs/
RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\ RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\
ln -s /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\ ln -s /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\
ln -s /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so ln -s /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so
...@@ -744,11 +753,20 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/ ...@@ -744,11 +753,20 @@ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
# Install dynamo, NIXL, and dynamo-specific dependencies # Install dynamo, NIXL, and dynamo-specific dependencies
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path> # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG ENABLE_GPU_MEMORY_SERVICE
COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ COPY --chmod=775 --chown=dynamo:0 --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
RUN uv pip install \ RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \ /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "${ENABLE_GPU_MEMORY_SERVICE}" = "true" ]; then \
GMS_WHEEL=$(ls /opt/dynamo/wheelhouse/gpu_memory_service*.whl 2>/dev/null | head -1); \
if [ -z "$GMS_WHEEL" ]; then \
echo "ERROR: ENABLE_GPU_MEMORY_SERVICE is true but no gpu_memory_service wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$GMS_WHEEL"; \
fi && \
if [ "${ENABLE_KVBM}" = "true" ]; then \ if [ "${ENABLE_KVBM}" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \ KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \ if [ -z "$KVBM_WHEEL" ]; then \
...@@ -823,6 +841,7 @@ RUN cd /usr/local/lib && \ ...@@ -823,6 +841,7 @@ RUN cd /usr/local/lib && \
ldconfig ldconfig
USER dynamo USER dynamo
ARG DYNAMO_COMMIT_SHA ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
......
...@@ -156,6 +156,10 @@ PUSH="" ...@@ -156,6 +156,10 @@ PUSH=""
# or can be explicitly enabled via --enable-kvbm flag # or can be explicitly enabled via --enable-kvbm flag
ENABLE_KVBM=false ENABLE_KVBM=false
# GPU Memory Service - default disabled, enabled automatically for VLLM/SGLANG
# or can be explicitly enabled via --enable-gpu-memory-service flag
ENABLE_GPU_MEMORY_SERVICE=false
# sccache configuration for S3 # sccache configuration for S3
USE_SCCACHE="" USE_SCCACHE=""
SCCACHE_BUCKET="" SCCACHE_BUCKET=""
...@@ -343,6 +347,9 @@ get_options() { ...@@ -343,6 +347,9 @@ get_options() {
--enable-kvbm) --enable-kvbm)
ENABLE_KVBM=true ENABLE_KVBM=true
;; ;;
--enable-gpu-memory-service)
ENABLE_GPU_MEMORY_SERVICE=true
;;
--enable-media-nixl) --enable-media-nixl)
ENABLE_MEDIA_NIXL=true ENABLE_MEDIA_NIXL=true
;; ;;
...@@ -539,6 +546,7 @@ show_help() { ...@@ -539,6 +546,7 @@ show_help() {
echo " [--release-build perform a release build]" echo " [--release-build perform a release build]"
echo " [--make-efa Adds AWS EFA layer on top of the built image (works with any target)]" echo " [--make-efa Adds AWS EFA layer on top of the built image (works with any target)]"
echo " [--enable-kvbm Enables KVBM support in Python 3.12]" echo " [--enable-kvbm Enables KVBM support in Python 3.12]"
echo " [--enable-gpu-memory-service Enables GPU Memory Service support]"
echo " [--enable-media-nixl Enable media processing with NIXL support (default: true for frameworks, false for none)]" echo " [--enable-media-nixl Enable media processing with NIXL support (default: true for frameworks, false for none)]"
echo " [--enable-media-ffmpeg Enable media processing with FFMPEG support (default: true for frameworks, false for none)]" echo " [--enable-media-ffmpeg Enable media processing with FFMPEG support (default: true for frameworks, false for none)]"
echo " [--use-sccache enable sccache for Rust/C/C++ compilation caching]" echo " [--use-sccache enable sccache for Rust/C/C++ compilation caching]"
...@@ -831,6 +839,20 @@ if [[ ${ENABLE_KVBM} == "true" ]]; then ...@@ -831,6 +839,20 @@ if [[ ${ENABLE_KVBM} == "true" ]]; then
BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} " BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} "
fi fi
# ENABLE_GPU_MEMORY_SERVICE: Used in Dockerfiles for gpu_memory_service wheel.
# Declared but not currently used in Dockerfile.trtllm.
# Force GPU Memory Service to be enabled for VLLM and SGLANG frameworks
if [[ $FRAMEWORK == "VLLM" ]] || [[ $FRAMEWORK == "SGLANG" ]]; then
echo "Forcing enable_gpu_memory_service to true in ${FRAMEWORK} image build"
ENABLE_GPU_MEMORY_SERVICE=true
fi
# For other frameworks, ENABLE_GPU_MEMORY_SERVICE defaults to false unless --enable-gpu-memory-service flag was provided
if [[ ${ENABLE_GPU_MEMORY_SERVICE} == "true" ]]; then
echo "Enabling GPU Memory Service in the dynamo image"
BUILD_ARGS+=" --build-arg ENABLE_GPU_MEMORY_SERVICE=${ENABLE_GPU_MEMORY_SERVICE} "
fi
# ENABLE_MEDIA_NIXL: Enable media processing with NIXL support # ENABLE_MEDIA_NIXL: Enable media processing with NIXL support
# Used in base Dockerfile for maturin build feature flag. # Used in base Dockerfile for maturin build feature flag.
# Can be explicitly overridden with --enable-media-nixl flag # Can be explicitly overridden with --enable-media-nixl flag
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service - out-of-process GPU memory manager.
The GPU Memory Service decouples ownership of GPU memory from the processes
that use it, enabling zero-copy sharing and data survival across process crashes.
Package structure:
- common/: Shared types and protocol (used by both server and client)
- server/: Allocation server daemon (no CUDA context required)
- client/: Client library for memory management
- client/torch/: PyTorch integration (allocator, tensor, module, extensions)
Primary client API:
from gpu_memory_service import (
GMSClientMemoryManager,
get_or_create_gms_client_memory_manager,
get_gms_client_memory_manager,
)
Server API:
from gpu_memory_service.server import GMSRPCServer
"""
# Primary client exports
from gpu_memory_service.client.memory_manager import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
)
# PyTorch integration (GMS client memory manager)
from gpu_memory_service.client.torch.allocator import (
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
# Public API of the gpu_memory_service package.
__all__ = [
    # Client types
    "GMSClientMemoryManager",
    "StaleMemoryLayoutError",
    # Accessors for the GMS client memory manager
    "get_or_create_gms_client_memory_manager",
    "get_gms_client_memory_manager",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service client library.
This module provides the client-side components for interacting with the
GPU Memory Service:
- GMSClientMemoryManager: Manages local VA mappings of remote GPU memory
- GMSRPCClient: Low-level RPC client (pure Python, no PyTorch dependency)
For PyTorch integration (MemPool, tensor utilities), see gpu_memory_service.client.torch.
"""
from gpu_memory_service.client.memory_manager import (
GMSClientMemoryManager,
StaleMemoryLayoutError,
)
from gpu_memory_service.client.rpc import GMSRPCClient
# Public API of the client subpackage.
__all__ = [
    "GMSClientMemoryManager",
    "StaleMemoryLayoutError",
    "GMSRPCClient",
]
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Client-side CUDA VMM utilities.
These functions wrap CUDA driver API calls used by the client memory manager
for importing, mapping, and unmapping GPU memory.
"""
from __future__ import annotations
from cuda.bindings import driver as cuda
from gpu_memory_service.common.cuda_vmm_utils import check_cuda_result
from gpu_memory_service.common.types import GrantedLockType
def import_handle_from_fd(fd: int) -> int:
    """Obtain a CUDA memory handle from a shared POSIX file descriptor.

    Args:
        fd: POSIX file descriptor received via SCM_RIGHTS.

    Returns:
        The imported CUDA memory handle, converted to ``int``.
    """
    fd_handle_type = (
        cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    )
    status, imported = cuda.cuMemImportFromShareableHandle(fd, fd_handle_type)
    # The driver status is validated by the shared helper.
    check_cuda_result(status, "cuMemImportFromShareableHandle")
    return int(imported)
def reserve_va(size: int, granularity: int) -> int:
    """Reserve a range of virtual address space.

    Args:
        size: Number of bytes to reserve (should be aligned to granularity).
        granularity: VMM allocation granularity, passed as the second
            argument of ``cuMemAddressReserve``.

    Returns:
        The reserved virtual address, converted to ``int``.
    """
    # The two trailing zeros are passed through unchanged from the original call.
    status, base_address = cuda.cuMemAddressReserve(size, granularity, 0, 0)
    check_cuda_result(status, "cuMemAddressReserve")
    return int(base_address)
def free_va(va: int, size: int) -> None:
    """Release a virtual address reservation.

    Args:
        va: Virtual address of the reservation to free.
        size: Size of the reservation.
    """
    # The call returns a one-element tuple holding only the status code.
    (status,) = cuda.cuMemAddressFree(va, size)
    check_cuda_result(status, "cuMemAddressFree")
def map_to_va(va: int, size: int, handle: int) -> None:
    """Map a CUDA memory handle at a virtual address.

    Args:
        va: Virtual address to map at (must be reserved).
        size: Size of the mapping.
        handle: CUDA memory handle to map.
    """
    # The third and fifth arguments are passed as 0, as in the original call.
    (status,) = cuda.cuMemMap(va, size, 0, handle, 0)
    check_cuda_result(status, "cuMemMap")
def set_access(va: int, size: int, device: int, access: GrantedLockType) -> None:
    """Apply access permissions to a mapped region.

    Args:
        va: Virtual address of the region.
        size: Size of the region.
        device: CUDA device index the permissions apply to.
        access: ``GrantedLockType.RO`` selects read-only protection; any other
            value selects read-write protection.
    """
    if access == GrantedLockType.RO:
        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READ
    else:
        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE

    # Build the access descriptor for the requested device.
    descriptor = cuda.CUmemAccessDesc()
    descriptor.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    descriptor.location.id = device
    descriptor.flags = protection

    # One descriptor is passed, so the count argument is 1.
    (status,) = cuda.cuMemSetAccess(va, size, [descriptor], 1)
    check_cuda_result(status, "cuMemSetAccess")
def unmap(va: int, size: int) -> None:
    """Remove the mapping of a virtual address region.

    Args:
        va: Virtual address to unmap.
        size: Size of the mapping.
    """
    # The call returns a one-element tuple holding only the status code.
    (status,) = cuda.cuMemUnmap(va, size)
    check_cuda_result(status, "cuMemUnmap")
def release_handle(handle: int) -> None:
    """Release a CUDA memory handle.

    Args:
        handle: CUDA memory handle to release.
    """
    # The call returns a one-element tuple holding only the status code.
    (status,) = cuda.cuMemRelease(handle)
    check_cuda_result(status, "cuMemRelease")
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""GPU Memory Service RPC Client.
Low-level RPC client stub. The client provides a simple interface for acquiring
locks and performing allocation operations. The socket connection IS the lock.
This module has NO PyTorch dependency.
Usage:
# Writer (acquires RW lock in constructor)
with GMSRPCClient(socket_path, lock_type=RequestedLockType.RW) as client:
alloc_id, aligned_size = client.allocate(size=1024*1024)
fd = client.export(alloc_id)
# ... write weights using fd ...
client.commit()
# Lock released on exit
# Reader (acquires RO lock in constructor)
client = GMSRPCClient(socket_path, lock_type=RequestedLockType.RO)
if client.committed: # Check if weights are valid
allocations = client.list_allocations()
for alloc in allocations:
fd = client.export(alloc["allocation_id"])
# ... import and map fd ...
# Keep connection open during inference!
# client.close() only when done with inference
"""
import logging
import socket
from typing import Dict, List, Optional, Tuple, Type, TypeVar
from gpu_memory_service.common.protocol.messages import (
AllocateRequest,
AllocateResponse,
ClearAllRequest,
ClearAllResponse,
CommitRequest,
CommitResponse,
ErrorResponse,
ExportRequest,
FreeRequest,
FreeResponse,
GetAllocationRequest,
GetAllocationResponse,
GetAllocationStateRequest,
GetAllocationStateResponse,
GetLockStateRequest,
GetLockStateResponse,
GetStateHashRequest,
GetStateHashResponse,
HandshakeRequest,
HandshakeResponse,
ListAllocationsRequest,
ListAllocationsResponse,
MetadataDeleteRequest,
MetadataDeleteResponse,
MetadataGetRequest,
MetadataGetResponse,
MetadataListRequest,
MetadataListResponse,
MetadataPutRequest,
MetadataPutResponse,
)
from gpu_memory_service.common.protocol.wire import recv_message_sync, send_message_sync
from gpu_memory_service.common.types import (
RW_REQUIRED,
GrantedLockType,
RequestedLockType,
)
# Generic placeholder for the response class that GMSRPCClient._call
# validates against and returns.
T = TypeVar("T")
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class GMSRPCClient:
    """GPU Memory Service RPC Client.

    CRITICAL: Socket connection IS the lock.
    - Constructor blocks until lock is acquired
    - close() releases the lock
    - committed property tells readers if weights are valid

    For writers (lock_type=RequestedLockType.RW):
    - Use context manager (with statement) for automatic lock release
    - Call commit() after weights are written
    - Call clear_all() before loading new model

    For readers (lock_type=RequestedLockType.RO):
    - Check committed property after construction
    - Keep connection open during inference lifetime
    - Only call close() when shutting down or allowing weight updates
    """

    def __init__(
        self,
        socket_path: str,
        lock_type: RequestedLockType = RequestedLockType.RO,
        timeout_ms: Optional[int] = None,
    ):
        """Connect to Allocation Server and acquire lock.

        Args:
            socket_path: Path to server's Unix domain socket
            lock_type: Requested lock type (RW, RO, or RW_OR_RO)
            timeout_ms: Timeout in milliseconds for lock acquisition.
                None means wait indefinitely.

        Raises:
            ConnectionError: If connection fails
            TimeoutError: If timeout_ms expires waiting for lock
        """
        self.socket_path = socket_path
        self._requested_lock_type = lock_type
        self._socket: Optional[socket.socket] = None
        self._recv_buffer = bytearray()
        self._committed = False
        self._granted_lock_type: Optional[GrantedLockType] = None

        # Connect and acquire lock
        self._connect(timeout_ms=timeout_ms)

    def _connect(self, timeout_ms: Optional[int]) -> None:
        """Connect to server and perform handshake (lock acquisition).

        The socket is only published as ``self._socket`` once the handshake
        has succeeded. On any failure (connect error, send/receive error,
        error response, unexpected response type, or unsuccessful handshake)
        the socket is closed before the exception propagates, so a failed
        client never reports ``is_connected`` and never leaks the descriptor.

        Args:
            timeout_ms: Forwarded in the handshake request as the lock
                acquisition timeout; None means wait indefinitely.

        Raises:
            ConnectionError: If connecting fails, the server replies with an
                ErrorResponse, or the reply is not a HandshakeResponse.
            TimeoutError: If the handshake response reports no success.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            try:
                sock.connect(self.socket_path)
            except FileNotFoundError:
                raise ConnectionError(
                    f"Server not running at {self.socket_path}"
                ) from None
            except Exception as e:
                raise ConnectionError(f"Failed to connect: {e}") from e

            # Send handshake (this IS lock acquisition)
            request = HandshakeRequest(
                lock_type=self._requested_lock_type, timeout_ms=timeout_ms
            )
            send_message_sync(sock, request)

            # Receive response (may block waiting for lock)
            response, _, self._recv_buffer = recv_message_sync(
                sock, self._recv_buffer
            )

            if isinstance(response, ErrorResponse):
                raise ConnectionError(f"Handshake error: {response.error}")
            if not isinstance(response, HandshakeResponse):
                raise ConnectionError(f"Unexpected response: {type(response)}")
            if not response.success:
                raise TimeoutError("Timeout waiting for lock")
        except BaseException:
            # The connection is the lock: never leave a half-open socket
            # behind, including when interrupted while blocked on the lock.
            sock.close()
            self._socket = None
            raise

        self._socket = sock
        self._committed = response.committed

        # Store granted lock type (may differ from requested for rw_or_ro mode)
        if response.granted_lock_type is not None:
            self._granted_lock_type = response.granted_lock_type
        elif self._requested_lock_type == RequestedLockType.RW:
            self._granted_lock_type = GrantedLockType.RW
        else:
            self._granted_lock_type = GrantedLockType.RO

        logger.info(
            f"Connected with {self._requested_lock_type.value} lock (granted={self._granted_lock_type.value}), "
            f"committed={self._committed}"
        )

    @property
    def committed(self) -> bool:
        """Check if weights are committed (valid)."""
        return self._committed

    @property
    def lock_type(self) -> Optional[GrantedLockType]:
        """Get the lock type actually granted by the server.

        For rw_or_ro mode, this tells you whether RW or RO was granted.
        """
        return self._granted_lock_type

    @property
    def is_connected(self) -> bool:
        """Check if client is connected."""
        return self._socket is not None

    def _send_recv(self, request) -> Tuple[object, int]:
        """Send request and receive response. Returns (response, fd).

        Raises:
            RuntimeError: If the client is not connected, or the server
                replied with an ErrorResponse.
        """
        if not self._socket:
            raise RuntimeError("Client not connected")

        send_message_sync(self._socket, request)
        response, fd, self._recv_buffer = recv_message_sync(
            self._socket, self._recv_buffer
        )

        if isinstance(response, ErrorResponse):
            raise RuntimeError(f"Server error: {response.error}")

        return response, fd

    def _call(self, request, response_type: Type[T]) -> T:
        """Send request, validate response type, return typed response.

        Raises:
            RuntimeError: If the request type is listed in RW_REQUIRED but
                the granted lock is not RW, or the response is not an
                instance of ``response_type``.
        """
        # Exact-type membership test against the RW_REQUIRED collection.
        if type(request) in RW_REQUIRED and self.lock_type != GrantedLockType.RW:
            raise RuntimeError("Operation requires RW connection")

        response, _ = self._send_recv(request)
        if not isinstance(response, response_type):
            raise RuntimeError(f"Unexpected response: {type(response)}")
        return response

    def get_lock_state(self) -> GetLockStateResponse:
        """Return the server's reply to a GetLockStateRequest."""
        return self._call(GetLockStateRequest(), GetLockStateResponse)

    def get_allocation_state(self) -> GetAllocationStateResponse:
        """Return the server's reply to a GetAllocationStateRequest."""
        return self._call(GetAllocationStateRequest(), GetAllocationStateResponse)

    def is_ready(self) -> bool:
        """Return the locally cached ``committed`` flag (no server round trip)."""
        return self.committed

    def commit(self) -> bool:
        """Commit weights and release RW lock. Returns True on success.

        If the commit round trip fails with a socket error, this connection
        is closed and the outcome is verified by opening a short-lived RO
        connection (1000 ms lock timeout) and reading its ``committed`` flag.
        On success the cached flag is set and the connection is closed.

        Raises:
            RuntimeError: If an RW lock is required but not held, the client
                is not connected, or the server returns an error response.
            ConnectionError: If the verification connection cannot be
                established after a socket error.
        """
        if CommitRequest in RW_REQUIRED and self.lock_type != GrantedLockType.RW:
            raise RuntimeError("Operation requires RW connection")

        try:
            response, _ = self._send_recv(CommitRequest())
            ok = isinstance(response, CommitResponse) and response.success
        except OSError as e:
            # OSError also covers ConnectionResetError and BrokenPipeError.
            # Server closes RW socket as part of commit
            logger.debug(
                f"Commit saw socket error ({type(e).__name__}); verifying via RO connect"
            )
            self.close()
            try:
                ro = GMSRPCClient(
                    self.socket_path, lock_type=RequestedLockType.RO, timeout_ms=1000
                )
                try:
                    ok = ro.committed
                finally:
                    ro.close()
            except TimeoutError:
                ok = False

        if ok:
            self._committed = True
            self.close()
            logger.info("Committed weights and released RW connection")
            return True
        return False

    def allocate(self, size: int, tag: str = "default") -> Tuple[str, int]:
        """Returns (allocation_id, aligned_size)."""
        r = self._call(AllocateRequest(size=size, tag=tag), AllocateResponse)
        return r.allocation_id, r.aligned_size

    def export(self, allocation_id: str) -> int:
        """Export allocation as POSIX FD. Caller must close.

        Raises:
            RuntimeError: If no file descriptor accompanied the response
                (fd < 0), the client is not connected, or the server
                returned an error response.
        """
        _, fd = self._send_recv(ExportRequest(allocation_id=allocation_id))
        if fd < 0:
            raise RuntimeError("No FD received from server")
        return fd

    def get_allocation(self, allocation_id: str) -> GetAllocationResponse:
        """Return the server's GetAllocationResponse for ``allocation_id``."""
        return self._call(
            GetAllocationRequest(allocation_id=allocation_id), GetAllocationResponse
        )

    def list_allocations(self, tag: Optional[str] = None) -> List[Dict]:
        """Return the ``allocations`` list from the server.

        Args:
            tag: Forwarded in the request; presumably restricts the listing
                to that tag when given -- confirm against the server.
        """
        return self._call(
            ListAllocationsRequest(tag=tag), ListAllocationsResponse
        ).allocations

    def free(self, allocation_id: str) -> bool:
        """Request that ``allocation_id`` be freed; returns the server's success flag."""
        return self._call(
            FreeRequest(allocation_id=allocation_id), FreeResponse
        ).success

    def clear_all(self) -> int:
        """Send a ClearAllRequest; returns the server-reported cleared count."""
        return self._call(ClearAllRequest(), ClearAllResponse).cleared_count

    def metadata_put(
        self, key: str, allocation_id: str, offset_bytes: int, value: bytes
    ) -> bool:
        """Store a metadata entry under ``key``; returns the server's success flag."""
        req = MetadataPutRequest(
            key=key, allocation_id=allocation_id, offset_bytes=offset_bytes, value=value
        )
        return self._call(req, MetadataPutResponse).success

    def metadata_get(self, key: str) -> Optional[Tuple[str, int, bytes]]:
        """Returns (allocation_id, offset_bytes, value) or None if not found."""
        r = self._call(MetadataGetRequest(key=key), MetadataGetResponse)
        return (r.allocation_id, r.offset_bytes, r.value) if r.found else None

    def metadata_delete(self, key: str) -> bool:
        """Delete the metadata entry for ``key``; returns the server's ``deleted`` flag."""
        return self._call(
            MetadataDeleteRequest(key=key), MetadataDeleteResponse
        ).deleted

    def metadata_list(self, prefix: str = "") -> List[str]:
        """Return the metadata keys reported by the server for ``prefix``."""
        return self._call(MetadataListRequest(prefix=prefix), MetadataListResponse).keys

    def get_memory_layout_hash(self) -> str:
        """Get state hash (hash of allocations + metadata). Empty if not committed."""
        return self._call(
            GetStateHashRequest(), GetStateHashResponse
        ).memory_layout_hash

    def close(self) -> None:
        """Close connection and release lock. Safe to call more than once."""
        if not self._socket:
            return
        try:
            self._socket.close()
        except Exception as e:
            # Best-effort close: record the failure but never raise from here.
            logger.debug(f"Ignoring error while closing socket: {e}")
        self._socket = None
        lock_str = self.lock_type.value if self.lock_type else "unknown"
        logger.info(f"Closed {lock_str} connection")

    def __enter__(self) -> "GMSRPCClient":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit."""
        self.close()

    def __del__(self):
        """Destructor: warn if connection not closed."""
        # getattr guards against a partially constructed instance.
        if getattr(self, "_socket", None):
            logger.warning("GMSRPCClient not closed properly")
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""PyTorch integration for GPU Memory Service.
This module provides PyTorch-specific functionality:
- Memory manager singleton management
- Tensor utilities (metadata, registration, materialization)
- C++ extension for CUDAPluggableAllocator
"""
from gpu_memory_service.client.torch.allocator import (
get_gms_client_memory_manager,
get_or_create_gms_client_memory_manager,
)
from gpu_memory_service.client.torch.module import (
materialize_module_from_gms,
register_module_tensors,
)
# Names re-exported as this package's public API.
__all__ = [
    # GMS client memory manager accessors (imported from .allocator)
    "get_or_create_gms_client_memory_manager",
    "get_gms_client_memory_manager",
    # Tensor operations (public API, imported from .module)
    "register_module_tensors",
    "materialize_module_from_gms",
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment