"...composable_kernel_onnxruntime.git" did not exist on "f03a1738d93c8ffccc570e8121e0a261e9950fa6"
Commit 3b660b67 authored by Lei Wang's avatar Lei Wang Committed by LeiWang1999
Browse files

[CostModel] Introduce cuda driver api to get precise shared memory capacity (#317)



* [Enhancement] Introduce CUDA driver module and refactor CUDA device handling

- Added a new `cuda_driver` module to encapsulate CUDA device properties and functionalities.
- Updated `CUDA` class in `cuda.py` to utilize the new driver for fetching device name and shared memory capabilities.
- Introduced `get_device_name` and `get_shared_memory_per_block` functions in the `cuda_driver` for improved device property management.
- This refactor enhances code organization and maintainability while improving the handling of CUDA device attributes.

* [Refactor] Clean up whitespace in CUDA-related files

- Removed unnecessary blank lines in `cuda.py`, `__init__.py`, and `cuda_driver.py` to improve code readability and maintainability.
- This change enhances the overall organization of the codebase without altering functionality.

---------
Co-authored-by: LeiWang1999 <wyatuestc@gmail.com>
parent 19c85907
...@@ -2,6 +2,7 @@ import tvm ...@@ -2,6 +2,7 @@ import tvm
from tvm.target import Target from tvm.target import Target
from .arch_base import TileDevice from .arch_base import TileDevice
from typing import List, Union from typing import List, Union
from .driver import cuda_driver
def check_sm_version(arch: str) -> int: def check_sm_version(arch: str) -> int:
...@@ -112,9 +113,11 @@ class CUDA(TileDevice): ...@@ -112,9 +113,11 @@ class CUDA(TileDevice):
device = tvm.runtime.cuda(0) device = tvm.runtime.cuda(0)
if not device.exist: if not device.exist:
raise RuntimeError("Cannot find cuda device 0.") raise RuntimeError("Cannot find cuda device 0.")
self.name = cuda_driver.get_device_name()
self.device: tvm.runtime.Device = device self.device: tvm.runtime.Device = device
self.platform: str = "CUDA" self.platform: str = "CUDA"
self.smem_cap = device.max_shared_memory_per_block # TODO(lei): maybe static shared memory, can be improved in future
self.smem_cap = cuda_driver.get_shared_memory_per_block()
self.compute_max_core = device.multi_processor_count self.compute_max_core = device.multi_processor_count
self.warp_size = device.warp_size self.warp_size = device.warp_size
self.compute_capability = device.compute_version.replace(".", "") self.compute_capability = device.compute_version.replace(".", "")
......
from .cuda_driver import (
get_cuda_device_properties, # noqa: F401
get_device_name, # noqa: F401
get_shared_memory_per_block, # noqa: F401
get_device_attribute, # noqa: F401
get_max_dynamic_shared_size_bytes, # noqa: F401
)
import ctypes
import sys
from typing import Optional
class cudaDeviceProp(ctypes.Structure):
    """ctypes mirror of the CUDA runtime's ``cudaDeviceProp`` struct.

    Instances are filled in by ``cudaGetDeviceProperties``; the field order,
    types, and sizes below must match the C struct layout of the loaded CUDA
    runtime exactly, or every read will be garbage.
    NOTE(review): the trailing ``reserved2``/``reserved1``/``reserved``
    padding implies a specific CUDA toolkit's struct layout — confirm it
    matches the runtime version actually loaded on the target machine.
    """
    _fields_ = [
        ("name", ctypes.c_char * 256),  # NUL-terminated device name, decoded by get_device_name()
        ("uuid", ctypes.c_byte * 16),  # cudaUUID_t
        ("luid", ctypes.c_char * 8),
        ("luidDeviceNodeMask", ctypes.c_uint),
        ("totalGlobalMem", ctypes.c_size_t),
        ("sharedMemPerBlock", ctypes.c_size_t),  # read by get_shared_memory_per_block()
        ("regsPerBlock", ctypes.c_int),
        ("warpSize", ctypes.c_int),
        ("memPitch", ctypes.c_size_t),
        ("maxThreadsPerBlock", ctypes.c_int),
        ("maxThreadsDim", ctypes.c_int * 3),
        ("maxGridSize", ctypes.c_int * 3),
        ("clockRate", ctypes.c_int),
        ("totalConstMem", ctypes.c_size_t),
        ("major", ctypes.c_int),
        ("minor", ctypes.c_int),
        ("textureAlignment", ctypes.c_size_t),
        ("texturePitchAlignment", ctypes.c_size_t),
        ("deviceOverlap", ctypes.c_int),
        ("multiProcessorCount", ctypes.c_int),
        ("kernelExecTimeoutEnabled", ctypes.c_int),
        ("integrated", ctypes.c_int),
        ("canMapHostMemory", ctypes.c_int),
        ("computeMode", ctypes.c_int),
        ("maxTexture1D", ctypes.c_int),
        ("maxTexture1DMipmap", ctypes.c_int),
        ("maxTexture1DLinear", ctypes.c_int),
        ("maxTexture2D", ctypes.c_int * 2),
        ("maxTexture2DMipmap", ctypes.c_int * 2),
        ("maxTexture2DLinear", ctypes.c_int * 3),
        ("maxTexture2DGather", ctypes.c_int * 2),
        ("maxTexture3D", ctypes.c_int * 3),
        ("maxTexture3DAlt", ctypes.c_int * 3),
        ("maxTextureCubemap", ctypes.c_int),
        ("maxTexture1DLayered", ctypes.c_int * 2),
        ("maxTexture2DLayered", ctypes.c_int * 3),
        ("maxTextureCubemapLayered", ctypes.c_int * 2),
        ("maxSurface1D", ctypes.c_int),
        ("maxSurface2D", ctypes.c_int * 2),
        ("maxSurface3D", ctypes.c_int * 3),
        ("maxSurface1DLayered", ctypes.c_int * 2),
        ("maxSurface2DLayered", ctypes.c_int * 3),
        ("maxSurfaceCubemap", ctypes.c_int),
        ("maxSurfaceCubemapLayered", ctypes.c_int * 2),
        ("surfaceAlignment", ctypes.c_size_t),
        ("concurrentKernels", ctypes.c_int),
        ("ECCEnabled", ctypes.c_int),
        ("pciBusID", ctypes.c_int),
        ("pciDeviceID", ctypes.c_int),
        ("pciDomainID", ctypes.c_int),
        ("tccDriver", ctypes.c_int),
        ("asyncEngineCount", ctypes.c_int),
        ("unifiedAddressing", ctypes.c_int),
        ("memoryClockRate", ctypes.c_int),
        ("memoryBusWidth", ctypes.c_int),
        ("l2CacheSize", ctypes.c_int),
        ("persistingL2CacheMaxSize", ctypes.c_int),
        ("maxThreadsPerMultiProcessor", ctypes.c_int),
        ("streamPrioritiesSupported", ctypes.c_int),
        ("globalL1CacheSupported", ctypes.c_int),
        ("localL1CacheSupported", ctypes.c_int),
        ("sharedMemPerMultiprocessor", ctypes.c_size_t),  # read by get_max_dynamic_shared_size_bytes()
        ("regsPerMultiprocessor", ctypes.c_int),
        ("managedMemory", ctypes.c_int),
        ("isMultiGpuBoard", ctypes.c_int),
        ("multiGpuBoardGroupID", ctypes.c_int),
        # Padding so the Python struct size covers the remainder of the C struct.
        ("reserved2", ctypes.c_int * 2),
        ("reserved1", ctypes.c_int * 1),
        ("reserved", ctypes.c_int * 60)
    ]
def get_cuda_device_properties(device_id: int = 0) -> Optional[cudaDeviceProp]:
    """Query ``cudaGetDeviceProperties`` through the CUDA runtime library.

    Args:
        device_id: CUDA device ordinal to query.

    Returns:
        A populated ``cudaDeviceProp``, or None when the runtime library
        cannot be loaded or the CUDA call fails (this honors the declared
        Optional contract instead of letting ``OSError`` escape).
    """
    if sys.platform == "win32":
        # NOTE(review): hard-coded for CUDA 11.x; other toolkits ship
        # differently versioned DLLs (e.g. cudart64_12.dll) — confirm target.
        candidates = ["cudart64_110.dll"]
        loader = ctypes.windll.LoadLibrary
    else:
        # Try the unversioned name first (original behavior), then common
        # SONAMEs present on systems without the CUDA dev package installed.
        candidates = ["libcudart.so", "libcudart.so.12", "libcudart.so.11.0"]
        loader = ctypes.cdll.LoadLibrary
    libcudart = None
    for lib_name in candidates:
        try:
            libcudart = loader(lib_name)
            break
        except OSError:
            continue
    if libcudart is None:
        # No CUDA runtime available on this machine.
        return None
    prop = cudaDeviceProp()
    cudaGetDeviceProperties = libcudart.cudaGetDeviceProperties
    cudaGetDeviceProperties.argtypes = [ctypes.POINTER(cudaDeviceProp), ctypes.c_int]
    cudaGetDeviceProperties.restype = ctypes.c_int
    ret = cudaGetDeviceProperties(ctypes.byref(prop), device_id)
    # Non-zero return codes are cudaError_t failures; signal them with None.
    return prop if ret == 0 else None
def get_device_name(device_id: int = 0) -> Optional[str]:
    """Return the human-readable name of CUDA device *device_id*.

    Raises:
        RuntimeError: if the device properties could not be queried.
    """
    props = get_cuda_device_properties(device_id)
    if props is None:
        raise RuntimeError("Failed to get device properties.")
    # ``name`` is a NUL-terminated C char array; decode it to str.
    return props.name.decode()
def get_shared_memory_per_block(device_id: int = 0, format: str = "bytes") -> Optional[int]:
    """Return the static shared memory available per block on the device.

    Args:
        device_id: CUDA device ordinal to query.
        format: Unit of the returned value: "bytes", "kb" or "mb".

    Raises:
        AssertionError: if *format* is not a supported unit.
        RuntimeError: if *format* is invalid under ``python -O`` (asserts
            stripped), or if the device properties could not be queried.
    """
    # Integer divisor per unit; the keys double as the set of valid formats.
    divisors = {"bytes": 1, "kb": 1024, "mb": 1024 * 1024}
    assert format in divisors, "Invalid format. Must be one of: bytes, kb, mb"
    prop = get_cuda_device_properties(device_id)
    if prop is None:
        raise RuntimeError("Failed to get device properties.")
    divisor = divisors.get(format)
    if divisor is None:
        # Reached only when asserts are stripped; keeps the original fallback.
        raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")
    # Convert the ctypes size_t to a plain int; floor division preserves the
    # original integer semantics for kb/mb.
    return int(prop.sharedMemPerBlock) // divisor
def get_device_attribute(attr: int, device_id: int = 0) -> Optional[int]:
    """Query a single device attribute via ``cudaDeviceGetAttribute``.

    Args:
        attr: Numeric value of a ``cudaDeviceAttr`` enum member.
        device_id: CUDA device ordinal to query.

    Returns:
        The attribute value, or None when the runtime library is unavailable
        or the CUDA call fails (the error is printed, matching the original
        best-effort contract; annotation corrected from ``int`` to
        ``Optional[int]`` to reflect that).
    """
    try:
        if sys.platform == "win32":
            libcudart = ctypes.windll.LoadLibrary("cudart64_110.dll")
        else:
            libcudart = ctypes.cdll.LoadLibrary("libcudart.so")
        value = ctypes.c_int()
        cudaDeviceGetAttribute = libcudart.cudaDeviceGetAttribute
        cudaDeviceGetAttribute.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int]
        cudaDeviceGetAttribute.restype = ctypes.c_int
        ret = cudaDeviceGetAttribute(ctypes.byref(value), attr, device_id)
        if ret != 0:
            raise RuntimeError(f"cudaDeviceGetAttribute failed with error {ret}")
        return value.value
    # Narrowed from ``except Exception``: OSError for a missing library,
    # AttributeError for a missing symbol, RuntimeError for a CUDA error code.
    except (OSError, AttributeError, RuntimeError) as e:
        print(f"Error getting device attribute: {str(e)}")
        return None
def get_max_dynamic_shared_size_bytes(device_id: int = 0, format: str = "bytes") -> Optional[int]:
    """Return the shared memory available per multiprocessor on the device.

    NOTE(review): despite the name, this reads ``sharedMemPerMultiprocessor``
    (per-SM capacity), not a per-block dynamic-shared-memory attribute —
    confirm this matches the intended "max dynamic" semantics.

    Args:
        device_id: CUDA device ordinal to query.
        format: Unit of the returned value: "bytes", "kb" or "mb".

    Raises:
        AssertionError: if *format* is not a supported unit.
        RuntimeError: if *format* is invalid under ``python -O`` (asserts
            stripped), or if the device properties could not be queried.
    """
    # Integer divisor per unit; the keys double as the set of valid formats.
    divisors = {"bytes": 1, "kb": 1024, "mb": 1024 * 1024}
    assert format in divisors, "Invalid format. Must be one of: bytes, kb, mb"
    prop = get_cuda_device_properties(device_id)
    if prop is None:
        raise RuntimeError("Failed to get device properties.")
    divisor = divisors.get(format)
    if divisor is None:
        # Reached only when asserts are stripped; keeps the original fallback.
        raise RuntimeError("Invalid format. Must be one of: bytes, kb, mb")
    # Convert the ctypes size_t to a plain int; floor division preserves the
    # original integer semantics for kb/mb.
    return int(prop.sharedMemPerMultiprocessor) // divisor
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment