# cuda_utils.py

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""CUDA driver helpers shared by the GMS client and server."""

from __future__ import annotations

import os

from gpu_memory_service.common.locks import GrantedLockType
from gpu_memory_service.common.utils import fail

try:
    from cuda.bindings import driver as cuda
except ImportError:
    # Keep import-time collection working in CPU-only environments and let the
    # first real CUDA call fail with a targeted message instead.
    class _MissingCuda:
        """Stand-in bound to ``cuda`` when ``cuda.bindings`` cannot be imported.

        Looking up any regular attribute raises ``RuntimeError`` with a message
        naming the missing dependency. Dunder lookups raise ``AttributeError``
        instead, so that generic introspection treats this object as simply
        lacking the attribute.
        """

        def __getattr__(self, name):
            # hasattr()/getattr(..., default) only swallow AttributeError. Tools
            # that probe special names (inspect.unwrap -> "__wrapped__", copy,
            # pickle, doctest collection) would otherwise crash on this object
            # before any CUDA call is attempted.
            if name.startswith("__") and name.endswith("__"):
                raise AttributeError(name)
            raise RuntimeError(
                "cuda-python is required for GPU Memory Service CUDA operations"
            )

    cuda = _MissingCuda()


def cuda_check_result(result: cuda.CUresult, name: str) -> None:
    """Report a non-success CUDA driver status through ``fail``.

    Does nothing when ``result`` is ``CUDA_SUCCESS``. Otherwise the message
    uses the text from ``cuGetErrorString`` when that lookup succeeds and
    yields a non-empty value, and falls back to ``str(result)`` when it
    does not.

    Args:
        result: Status code returned by a CUDA driver call
        name: Name of the driver call, included in the failure message
    """
    success = cuda.CUresult.CUDA_SUCCESS
    if result == success:
        return

    lookup_status, description = cuda.cuGetErrorString(result)
    if lookup_status == success and description:
        # The binding may hand back bytes; normalise to text either way.
        if isinstance(description, bytes):
            detail = description.decode()
        else:
            detail = str(description)
    else:
        detail = str(result)

    fail("fatal CUDA VMM error in %s: %s", name, detail)


def cuda_ensure_initialized() -> None:
    """Call ``cuInit(0)`` and check the returned status."""
    (status,) = cuda.cuInit(0)
    cuda_check_result(status, "cuInit")


def cumem_get_allocation_granularity(device: int) -> int:
    """Query the minimum VMM allocation granularity for a device.

    The query describes a pinned, device-located allocation that requests a
    POSIX file descriptor handle type.

    Args:
        device: CUDA device index

    Returns:
        Allocation granularity in bytes (typically 2 MiB)
    """
    fd_handle_type = (
        cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    )
    minimum = cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM

    alloc_prop = cuda.CUmemAllocationProp()
    alloc_prop.type = cuda.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
    alloc_prop.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    alloc_prop.location.id = device
    alloc_prop.requestedHandleTypes = fd_handle_type

    status, value = cuda.cuMemGetAllocationGranularity(alloc_prop, minimum)
    cuda_check_result(status, "cuMemGetAllocationGranularity")
    return int(value)


def cumem_create_tolerate_oom(size: int, device: int) -> tuple[bool, int]:
    """Create a physical VMM allocation, reporting out-of-memory as a value.

    The allocation is requested as pinned device memory with a POSIX file
    descriptor handle type.

    Args:
        size: Allocation size in bytes, passed to ``cuMemCreate`` unchanged
        device: CUDA device index

    Returns:
        ``(True, handle)`` when ``cuMemCreate`` succeeds. ``(False, 0)`` when
        it reports ``CUDA_ERROR_OUT_OF_MEMORY``. Any other status goes to
        ``cuda_check_result``; if that call returns, the result is also
        ``(False, 0)``.
    """
    alloc_prop = cuda.CUmemAllocationProp()
    alloc_prop.type = cuda.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
    alloc_prop.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    alloc_prop.location.id = device
    alloc_prop.requestedHandleTypes = (
        cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    )

    status, alloc_handle = cuda.cuMemCreate(size, alloc_prop, 0)
    if status == cuda.CUresult.CUDA_SUCCESS:
        return True, int(alloc_handle)

    # Out-of-memory is the one tolerated failure; everything else is checked.
    if status != cuda.CUresult.CUDA_ERROR_OUT_OF_MEMORY:
        cuda_check_result(status, "cuMemCreate")
    return False, 0


def cumem_export_to_shareable_handle(handle: int) -> int:
    """Export an allocation handle as a POSIX file descriptor.

    Args:
        handle: Allocation handle to export

    Returns:
        The exported file descriptor as an ``int``
    """
    fd_handle_type = (
        cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    )
    status, exported_fd = cuda.cuMemExportToShareableHandle(handle, fd_handle_type, 0)
    cuda_check_result(status, "cuMemExportToShareableHandle")
    return int(exported_fd)


def align_to_granularity(size: int, granularity: int) -> int:
    """Align size up to VMM granularity.

    Args:
        size: Size in bytes
        granularity: Allocation granularity in bytes; must be positive

    Returns:
        The smallest multiple of ``granularity`` that is not less than ``size``

    Raises:
        ValueError: If ``granularity`` is zero or negative
    """
    # A negative granularity would make the rounding below return a value
    # smaller than ``size``, and zero would surface as an opaque
    # ZeroDivisionError, so reject both up front.
    if granularity <= 0:
        raise ValueError(f"granularity must be positive, got {granularity}")
    return ((size + granularity - 1) // granularity) * granularity


def cumem_import_from_shareable_handle_close_fd(fd: int) -> int:
    """Import an allocation handle from a POSIX file descriptor, then close it.

    ``fd`` is closed with ``os.close`` whether or not the import succeeds.

    Args:
        fd: File descriptor to import from; closed by this function

    Returns:
        The imported allocation handle as an ``int``
    """
    try:
        # The enum lookup stays inside the try so the descriptor is closed
        # even if the lookup itself raises.
        fd_handle_type = (
            cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
        )
        status, imported = cuda.cuMemImportFromShareableHandle(fd, fd_handle_type)
        cuda_check_result(status, "cuMemImportFromShareableHandle")
        return int(imported)
    finally:
        os.close(fd)


def cumem_address_reserve(size: int, granularity: int) -> int:
    """Reserve a virtual address range with ``cuMemAddressReserve``.

    ``granularity`` is passed as the second argument of the driver call; the
    remaining two arguments are both ``0``.

    Args:
        size: Size of the range to reserve, in bytes
        granularity: Value passed as the second driver argument

    Returns:
        The reserved address as an ``int``
    """
    status, base_address = cuda.cuMemAddressReserve(size, granularity, 0, 0)
    cuda_check_result(status, "cuMemAddressReserve")
    return int(base_address)


def cumem_address_free(va: int, size: int) -> None:
    """Release a virtual address range with ``cuMemAddressFree``.

    Args:
        va: Start address of the range
        size: Size of the range in bytes
    """
    (status,) = cuda.cuMemAddressFree(va, size)
    cuda_check_result(status, "cuMemAddressFree")


def cumem_map(va: int, size: int, handle: int) -> None:
    """Map an allocation handle into a virtual address range with ``cuMemMap``.

    The third and fifth arguments of the driver call are both ``0``.

    Args:
        va: Start address of the range to map into
        size: Size of the mapping in bytes
        handle: Allocation handle to map
    """
    (status,) = cuda.cuMemMap(va, size, 0, handle, 0)
    cuda_check_result(status, "cuMemMap")


def cumem_set_access(va: int, size: int, device: int, access: GrantedLockType) -> None:
    """Set a device's access protection on a mapped range with ``cuMemSetAccess``.

    Args:
        va: Start address of the range
        size: Size of the range in bytes
        device: CUDA device index the access descriptor refers to
        access: ``GrantedLockType.RO`` selects read-only protection; every
            other value selects read-write
    """
    descriptor = cuda.CUmemAccessDesc()
    descriptor.location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    descriptor.location.id = device

    if access == GrantedLockType.RO:
        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READ
    else:
        protection = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
    descriptor.flags = protection

    # One descriptor is passed, so the count argument is 1.
    (status,) = cuda.cuMemSetAccess(va, size, [descriptor], 1)
    cuda_check_result(status, "cuMemSetAccess")


def cumem_unmap(va: int, size: int) -> None:
    """Unmap a virtual address range with ``cuMemUnmap``.

    Args:
        va: Start address of the range
        size: Size of the range in bytes
    """
    (status,) = cuda.cuMemUnmap(va, size)
    cuda_check_result(status, "cuMemUnmap")


def cumem_release(handle: int) -> None:
    """Release an allocation handle with ``cuMemRelease``.

    Args:
        handle: Allocation handle to release
    """
    (status,) = cuda.cuMemRelease(handle)
    cuda_check_result(status, "cuMemRelease")


def cuda_validate_pointer(va: int) -> None:
    """Check that the driver can answer a device-pointer query for ``va``.

    Only the status of ``cuPointerGetAttribute`` is checked; the attribute
    value it returns is discarded.

    Args:
        va: Address to query
    """
    attribute = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
    status, _unused = cuda.cuPointerGetAttribute(attribute, va)
    cuda_check_result(status, "cuPointerGetAttribute")


def cuda_synchronize() -> None:
    """Call ``cuCtxSynchronize`` and check the returned status."""
    (status,) = cuda.cuCtxSynchronize()
    cuda_check_result(status, "cuCtxSynchronize")