Unverified Commit 2b1da821 authored by Serge Panev, committed by GitHub

[NVIDIA] Add new SMs support for Spark & Thor (#11287)


Signed-off-by: Serge Panev <spanev@nvidia.com>
parent 97710ccd
@@ -452,7 +452,15 @@ def get_available_gpu_memory(
         if empty_cache:
             torch.cuda.empty_cache()
-        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+        SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121)  # Orin, Thor, Spark
+        if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
+            # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
+            # only reports "free" memory, which can be lower than what is actually
+            # available due to not including cache memory. So we use the system available
+            # memory metric instead.
+            free_gpu_memory = psutil.virtual_memory().available
+        else:
+            free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
     elif device == "xpu":
         num_gpus = torch.xpu.device_count()
...
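Note on the Python hunk above: SMs 87, 110, and 121 (Orin, Thor, Spark per the tuple's comment) are devices whose "device memory" is the host's system memory, so torch.cuda.mem_get_info() only reports currently-free bytes and misses reclaimable cache, while psutil.virtual_memory().available reflects what can actually be used. A minimal C++ sketch of how such devices could be recognized through the CUDA runtime, assuming get_device_sm() returns major * 10 + minor; the helper names below are hypothetical and not part of this patch:

#include <cuda_runtime.h>

// Hypothetical helpers, for illustration only.
inline int deviceSmVersion(int device) {
  int major = 0, minor = 0;
  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
  cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device);
  return major * 10 + minor;  // e.g. 8.7 -> 87, 11.0 -> 110, 12.1 -> 121
}

inline bool sharesSystemMemory(int device) {
  // Integrated GPUs report cudaDevAttrIntegrated == 1; on such devices the GPU
  // pool is the same physical memory that psutil reports for the host. The patch
  // itself keys off the SM numbers instead of this attribute.
  int integrated = 0;
  cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, device);
  return integrated == 1;
}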
@@ -568,7 +568,7 @@ void scaled_fp4_experts_quant_sm100a(
     torch::Tensor const& input_offset_by_experts,
     torch::Tensor const& output_scale_offset_by_experts) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
   CHECK_INPUT(output, "output must be a CUDA tensor");
   CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
@@ -652,7 +652,7 @@ void silu_and_mul_scaled_fp4_experts_quant_sm100a(
     torch::Tensor const& mask,
     bool use_silu_and_mul) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
   CHECK_INPUT(output, "output must be a CUDA tensor");
   CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
...
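The relaxed TORCH_CHECK is what lets the new parts through at runtime: assuming getSMVersion() returns major * 10 + minor for the current device (100/103 for datacenter Blackwell, 110 for Thor, 121 for Spark per the tuple in the Python hunk), `sm_version >= 100` accepts all of them, whereas the old equality check rejected Thor and Spark. A sketch of the predicate, not the repo's code:

// Sketch only; fp4QuantSupported is a hypothetical name.
inline bool fp4QuantSupported(int sm_version) {
  // Old check: sm_version == 100 || sm_version == 103  -> rejects 110 (Thor) and 121 (Spark)
  // New check: any Blackwell-generation SM passes
  return sm_version >= 100;
}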
@@ -50,8 +50,9 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;
 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
   uint32_t val;
   asm volatile(
       "{\n"
@@ -76,14 +77,17 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
       "f"(array[7]));
   return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
   return 0;
 #endif
 }
 
 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
   uint32_t val;
   asm volatile(
       "{\n"
@@ -108,6 +112,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
       "f"(array[3].y));
   return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
   return 0;
 #endif
 }
...
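On the compile-time side, the CUTLASS_ARCH_MMA_SM100A_ENABLED / SM103A_ENABLED macros only cover arch-specific sm_100a/sm_103a builds, so the guard gains a branch for family-specific targets. The sketch below restates the pattern with a hypothetical convenience macro; it assumes __CUDA_ARCH_FAMILY_SPECIFIC__ follows the usual __CUDA_ARCH__-style encoding (e.g. sm_110f -> 1100, sm_121f -> 1210), so `> 1000` keeps the PTX path on family-specific builds newer than sm_100f, and the fallback now traps loudly instead of silently returning 0:

#include <cstdint>
#include <cstdio>

// Sketch, not repo code: E2M1_PTX_AVAILABLE is a hypothetical convenience macro
// wrapping the same guard used in the patch above.
#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
#define E2M1_PTX_AVAILABLE 1
#else
#define E2M1_PTX_AVAILABLE 0
#endif

__device__ inline uint32_t convert_or_trap(float (&array)[8]) {
#if E2M1_PTX_AVAILABLE
  // PTX-backed converter was compiled in for this target.
  return fp32_vec_to_e2m1(array);
#else
  // Mirrors the new fallback behavior: fail loudly on unsupported architectures.
  printf("e2m1 conversion not compiled for this architecture\n");
  __trap();
  return 0;
#endif
}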
@@ -202,7 +202,7 @@ inline int getMultiProcessorCount() {
 void scaled_fp4_quant_sm100a(
     torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
   int32_t m = input.size(0);
   int32_t n = input.size(1);
...