"...composable_kernel_onnxruntime.git" did not exist on "f03a1738d93c8ffccc570e8121e0a261e9950fa6"
Commit 3b660b67 authored by Lei Wang's avatar Lei Wang Committed by LeiWang1999
Browse files

[CostModel] Introduce cuda driver api to get precise shared memory capacity (#317)



* [Enhancement] Introduce CUDA driver module and refactor CUDA device handling

- Added a new `cuda_driver` module to encapsulate CUDA device properties and functionalities.
- Updated `CUDA` class in `cuda.py` to utilize the new driver for fetching device name and shared memory capabilities.
- Introduced `get_device_name` and `get_shared_memory_per_block` functions in the `cuda_driver` for improved device property management.
- This refactor enhances code organization and maintainability while improving the handling of CUDA device attributes.

* [Refactor] Clean up whitespace in CUDA-related files

- Removed unnecessary blank lines in `cuda.py`, `__init__.py`, and `cuda_driver.py` to improve code readability and maintainability.
- This change enhances the overall organization of the codebase without altering functionality.

---------
Co-authored-by: LeiWang1999 <wyatuestc@gmail.com>
parent 19c85907
...@@ -2,6 +2,7 @@ import tvm ...@@ -2,6 +2,7 @@ import tvm
from tvm.target import Target from tvm.target import Target
from .arch_base import TileDevice from .arch_base import TileDevice
from typing import List, Union from typing import List, Union
from .driver import cuda_driver
def check_sm_version(arch: str) -> int: def check_sm_version(arch: str) -> int:
...@@ -112,9 +113,11 @@ class CUDA(TileDevice): ...@@ -112,9 +113,11 @@ class CUDA(TileDevice):
device = tvm.runtime.cuda(0) device = tvm.runtime.cuda(0)
if not device.exist: if not device.exist:
raise RuntimeError("Cannot find cuda device 0.") raise RuntimeError("Cannot find cuda device 0.")
self.name = cuda_driver.get_device_name()
self.device: tvm.runtime.Device = device self.device: tvm.runtime.Device = device
self.platform: str = "CUDA" self.platform: str = "CUDA"
self.smem_cap = device.max_shared_memory_per_block # TODO(lei): maybe static shared memory, can be improved in future
self.smem_cap = cuda_driver.get_shared_memory_per_block()
self.compute_max_core = device.multi_processor_count self.compute_max_core = device.multi_processor_count
self.warp_size = device.warp_size self.warp_size = device.warp_size
self.compute_capability = device.compute_version.replace(".", "") self.compute_capability = device.compute_version.replace(".", "")
......
from .cuda_driver import (
get_cuda_device_properties, # noqa: F401
get_device_name, # noqa: F401
get_shared_memory_per_block, # noqa: F401
get_device_attribute, # noqa: F401
get_max_dynamic_shared_size_bytes, # noqa: F401
)
import ctypes
import sys
from typing import Optional
class cudaDeviceProp(ctypes.Structure):
    """ctypes mirror of the CUDA runtime's ``cudaDeviceProp`` struct.

    Instances are filled in by ``cudaGetDeviceProperties``; the field order,
    types, and sizes below must match the C struct layout of the loaded CUDA
    runtime exactly, or every read will be garbage.
    NOTE(review): the trailing ``reserved2``/``reserved1``/``reserved``
    padding implies a specific CUDA toolkit's struct layout — confirm it
    matches the runtime version actually loaded on the target machine.
    """
    _fields_ = [
        ("name", ctypes.c_char * 256),  # NUL-terminated device name, decoded by get_device_name()
        ("uuid", ctypes.c_byte * 16),  # cudaUUID_t
        ("luid", ctypes.c_char * 8),
        ("luidDeviceNodeMask", ctypes.c_uint),
        ("totalGlobalMem", ctypes.c_size_t),
        ("sharedMemPerBlock", ctypes.c_size_t),  # read by get_shared_memory_per_block()
        ("regsPerBlock", ctypes.c_int),
        ("warpSize", ctypes.c_int),
        ("memPitch", ctypes.c_size_t),
        ("maxThreadsPerBlock", ctypes.c_int),
        ("maxThreadsDim", ctypes.c_int * 3),
        ("maxGridSize", ctypes.c_int * 3),
        ("clockRate", ctypes.c_int),
        ("totalConstMem", ctypes.c_size_t),
        ("major", ctypes.c_int),
        ("minor", ctypes.c_int),
        ("textureAlignment", ctypes.c_size_t),
        ("texturePitchAlignment", ctypes.c_size_t),
        ("deviceOverlap", ctypes.c_int),
        ("multiProcessorCount", ctypes.c_int),
        ("kernelExecTimeoutEnabled", ctypes.c_int),
        ("integrated", ctypes.c_int),
        ("canMapHostMemory", ctypes.c_int),
        ("computeMode", ctypes.c_int),
        ("maxTexture1D", ctypes.c_int),
        ("maxTexture1DMipmap", ctypes.c_int),
        ("maxTexture1DLinear", ctypes.c_int),
        ("maxTexture2D", ctypes.c_int * 2),
        ("maxTexture2DMipmap", ctypes.c_int * 2),
        ("maxTexture2DLinear", ctypes.c_int * 3),
        ("maxTexture2DGather", ctypes.c_int * 2),
        ("maxTexture3D", ctypes.c_int * 3),
        ("maxTexture3DAlt", ctypes.c_int * 3),
        ("maxTextureCubemap", ctypes.c_int),
        ("maxTexture1DLayered", ctypes.c_int * 2),
        ("maxTexture2DLayered", ctypes.c_int * 3),
        ("maxTextureCubemapLayered", ctypes.c_int * 2),
        ("maxSurface1D", ctypes.c_int),
        ("maxSurface2D", ctypes.c_int * 2),
        ("maxSurface3D", ctypes.c_int * 3),
        ("maxSurface1DLayered", ctypes.c_int * 2),
        ("maxSurface2DLayered", ctypes.c_int * 3),
        ("maxSurfaceCubemap", ctypes.c_int),
        ("maxSurfaceCubemapLayered", ctypes.c_int * 2),
        ("surfaceAlignment", ctypes.c_size_t),
        ("concurrentKernels", ctypes.c_int),
        ("ECCEnabled", ctypes.c_int),
        ("pciBusID", ctypes.c_int),
        ("pciDeviceID", ctypes.c_int),
        ("pciDomainID", ctypes.c_int),
        ("tccDriver", ctypes.c_int),
        ("asyncEngineCount", ctypes.c_int),
        ("unifiedAddressing", ctypes.c_int),
        ("memoryClockRate", ctypes.c_int),
        ("memoryBusWidth", ctypes.c_int),
        ("l2CacheSize", ctypes.c_int),
        ("persistingL2CacheMaxSize", ctypes.c_int),
        ("maxThreadsPerMultiProcessor", ctypes.c_int),
        ("streamPrioritiesSupported", ctypes.c_int),
        ("globalL1CacheSupported", ctypes.c_int),
        ("localL1CacheSupported", ctypes.c_int),
        ("sharedMemPerMultiprocessor", ctypes.c_size_t),  # read by get_max_dynamic_shared_size_bytes()
        ("regsPerMultiprocessor", ctypes.c_int),
        ("managedMemory", ctypes.c_int),
        ("isMultiGpuBoard", ctypes.c_int),
        ("multiGpuBoardGroupID", ctypes.c_int),
        # Padding so the Python struct size covers the remainder of the C struct.
        ("reserved2", ctypes.c_int * 2),
        ("reserved1", ctypes.c_int * 1),
        ("reserved", ctypes.c_int * 60)
    ]
def get_cuda_device_properties(device_id: int = 0) -> Optional[cudaDeviceProp]:
    """Query ``cudaGetDeviceProperties`` through the CUDA runtime library.

    Args:
        device_id: CUDA device ordinal to query.

    Returns:
        A populated ``cudaDeviceProp``, or None when the runtime library
        cannot be loaded or the CUDA call fails (this honors the declared
        Optional contract instead of letting ``OSError`` escape).
    """
    if sys.platform == "win32":
        # NOTE(review): hard-coded for CUDA 11.x; other toolkits ship
        # differently versioned DLLs (e.g. cudart64_12.dll) — confirm target.
        candidates = ["cudart64_110.dll"]
        loader = ctypes.windll.LoadLibrary
    else:
        # Try the unversioned name first (original behavior), then common
        # SONAMEs present on systems without the CUDA dev package installed.
        candidates = ["libcudart.so", "libcudart.so.12", "libcudart.so.11.0"]
        loader = ctypes.cdll.LoadLibrary
    libcudart = None
    for lib_name in candidates:
        try:
            libcudart = loader(lib_name)
            break
        except OSError:
            continue
    if libcudart is None:
        # No CUDA runtime available on this machine.
        return None
    prop = cudaDeviceProp()
    cudaGetDeviceProperties = libcudart.cudaGetDeviceProperties
    cudaGetDeviceProperties.argtypes = [ctypes.POINTER(cudaDeviceProp), ctypes.c_int]
    cudaGetDeviceProperties.restype = ctypes.c_int
    ret = cudaGetDeviceProperties(ctypes.byref(prop), device_id)
    # Non-zero return codes are cudaError_t failures; signal them with None.
    return prop if ret == 0 else None
def get_device_name(device_id: int = 0) -> Optional[str]:
    """Return the human-readable name of CUDA device *device_id*.

    Raises:
        RuntimeError: if the device properties could not be queried.
    """
    props = get_cuda_device_properties(device_id)
    if props is None:
        raise RuntimeError("Failed to get device properties.")
    # ``name`` is a NUL-terminated C char array; decode it to str.
    return props.name.decode()
def get_shared_memory_per_block(device_id: int = 0, format: str = "bytes") -> Optional[int]:
    """Return the static shared memory available per block on the device.

    Args:
        device_id: CUDA device ordinal to query.
        format: Unit of the returned value: "bytes", "kb" or "mb".

    Raises:
        AssertionError: if *format* is not a supported unit.
        RuntimeError: if *format* is invalid under ``python -O`` (asserts
            stripped), or if the device properties could not be queried.
    """
    # Integer divisor per unit; the keys double as the set of valid formats.
    divisors = {"bytes": 1, "kb": 1024, "mb": 1024 * 1024}
    assert format in divisors, "Invalid format. Must be one of: bytes, kb, mb"
    prop = get_cuda_device_properties(device_id)
    if prop is None:
        raise RuntimeError("Failed to get device properties.")
    divisor = divisors.get(format)
    if divisor is None:
        # Reached only when asserts are stripped; keeps the original fallback.
        raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")
    # Convert the ctypes size_t to a plain int; floor division preserves the
    # original integer semantics for kb/mb.
    return int(prop.sharedMemPerBlock) // divisor
def get_device_attribute(attr: int, device_id: int = 0) -> Optional[int]:
    """Query a single device attribute via ``cudaDeviceGetAttribute``.

    Args:
        attr: Numeric value of a ``cudaDeviceAttr`` enum member.
        device_id: CUDA device ordinal to query.

    Returns:
        The attribute value, or None when the runtime library is unavailable
        or the CUDA call fails (the error is printed, matching the original
        best-effort contract; annotation corrected from ``int`` to
        ``Optional[int]`` to reflect that).
    """
    try:
        if sys.platform == "win32":
            libcudart = ctypes.windll.LoadLibrary("cudart64_110.dll")
        else:
            libcudart = ctypes.cdll.LoadLibrary("libcudart.so")
        value = ctypes.c_int()
        cudaDeviceGetAttribute = libcudart.cudaDeviceGetAttribute
        cudaDeviceGetAttribute.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int]
        cudaDeviceGetAttribute.restype = ctypes.c_int
        ret = cudaDeviceGetAttribute(ctypes.byref(value), attr, device_id)
        if ret != 0:
            raise RuntimeError(f"cudaDeviceGetAttribute failed with error {ret}")
        return value.value
    # Narrowed from ``except Exception``: OSError for a missing library,
    # AttributeError for a missing symbol, RuntimeError for a CUDA error code.
    except (OSError, AttributeError, RuntimeError) as e:
        print(f"Error getting device attribute: {str(e)}")
        return None
def get_max_dynamic_shared_size_bytes(device_id: int = 0, format: str = "bytes") -> Optional[int]:
    """Return the shared memory available per multiprocessor on the device.

    NOTE(review): despite the name, this reads ``sharedMemPerMultiprocessor``
    (per-SM capacity), not a per-block dynamic-shared-memory attribute —
    confirm this matches the intended "max dynamic" semantics.

    Args:
        device_id: CUDA device ordinal to query.
        format: Unit of the returned value: "bytes", "kb" or "mb".

    Raises:
        AssertionError: if *format* is not a supported unit.
        RuntimeError: if *format* is invalid under ``python -O`` (asserts
            stripped), or if the device properties could not be queried.
    """
    # Integer divisor per unit; the keys double as the set of valid formats.
    divisors = {"bytes": 1, "kb": 1024, "mb": 1024 * 1024}
    assert format in divisors, "Invalid format. Must be one of: bytes, kb, mb"
    prop = get_cuda_device_properties(device_id)
    if prop is None:
        raise RuntimeError("Failed to get device properties.")
    divisor = divisors.get(format)
    if divisor is None:
        # Reached only when asserts are stripped; keeps the original fallback.
        raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")
    # Convert the ctypes size_t to a plain int; floor division preserves the
    # original integer semantics for kb/mb.
    return int(prop.sharedMemPerMultiprocessor) // divisor
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment