common.cuh 1.68 KB
Newer Older
1
2
3
4
#pragma once

// TODO: Let's make some of these constexpr and put in a namespace.

5
6
7
8
9
10
11
12
13
14
15
16
17
18
// Compute-capability codes, written in the same numeric form as __CUDA_ARCH__
// (major * 100 + minor * 10, e.g. CC 8.6 -> 860) so they can be compared with
// it directly in preprocessor conditionals.
#define BNB_CC_MAXWELL 500        // CC 5.0
#define BNB_CC_MAXWELL2 520       // CC 5.2
#define BNB_CC_MAXWELL2_X1 530    // CC 5.3
#define BNB_CC_PASCAL 600         // CC 6.0
#define BNB_CC_PASCAL_X2 620      // CC 6.2
#define BNB_CC_VOLTA 700          // CC 7.0
#define BNB_CC_VOLTA_XAVIER 720   // CC 7.2
#define BNB_CC_TURING 750         // CC 7.5
#define BNB_CC_AMPERE 800         // CC 8.0
#define BNB_CC_AMPERE2 860        // CC 8.6
#define BNB_CC_AMPERE2_ORIN 870   // CC 8.7
#define BNB_CC_ADA 890            // CC 8.9
#define BNB_CC_HOPPER 900         // CC 9.0
#define BNB_CC_BLACKWELL 1000     // CC 10.0
19

20
21
22
23
24
// Feature-availability predicates for the architecture currently being
// compiled. Each one expands to a comparison of __CUDA_ARCH__ against the
// minimum compute-capability code for that feature.
//
// NOTE: __CUDA_ARCH__ is only defined during device compilation. Inside an
// #if an undefined identifier evaluates to 0, so all of these are false in the
// host pass; used outside of #if in host code they would reference an
// undeclared identifier.
#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1)     // >= 530
#define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA)       // >= 700
#define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER) // >= 720
#define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE)          // >= 800
#define BNB_FP8_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_ADA)              // >= 890
25

26
// Number of threads per warp, taken from CUDA's built-in `warpSize` variable.
// NOTE(review): `warpSize` is a device-code variable rather than a literal, so
// this macro and anything derived from it cannot be evaluated by the
// preprocessor in an #if and is not available in host code. Confirm that no
// user needs a compile-time constant here.
#define BNB_WARP_SIZE warpSize
27
28
29
30

// Maximum number of resident threads per SM for the architecture being
// compiled. The limit varies by compute capability:
//   - sm_75 (Turing):                           1024
//   - sm_86 .. sm_89 (GA10x, Orin, Ada):        1536
//   - sm_12x (Blackwell, compute capability 12.x): 1536
//   - everything else handled here (all prior to Turing, A100 / sm_80,
//     H100 / sm_90, sm_100):                    2048,
//     which allows for 2 full blocks of 1024 threads per SM.
//
// __CUDA_ARCH__ is undefined in the host compilation pass and evaluates to 0
// inside #if, so host code falls through to the 2048 default.
//
// Reference:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
#if __CUDA_ARCH__ == 750
#define BNB_MAX_THREADS_PER_SM 1024
#elif (__CUDA_ARCH__ >= 860 && __CUDA_ARCH__ <= 890) || (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
#define BNB_MAX_THREADS_PER_SM 1536
#else
#define BNB_MAX_THREADS_PER_SM 2048
#endif

// Maximum resident warps per SM is always directly related to the number of threads.
42
// Computed as the per-SM resident thread limit divided by the warp size.
// NOTE(review): when BNB_WARP_SIZE expands to a non-literal such as
// `warpSize`, this expression is not a preprocessor constant; confirm it is
// only used where a device-side expression is acceptable.
#define BNB_MAX_WARPS_PER_SM ((BNB_MAX_THREADS_PER_SM) / (BNB_WARP_SIZE))
43
44
45

// Resident-block limit per SM.
// Architectures 860 and 870 are pinned to 16 blocks; every other target uses
// half of its resident-warp limit.
#if __CUDA_ARCH__ != 860 && __CUDA_ARCH__ != 870
#define BNB_MAX_BLOCKS_PER_SM ((BNB_MAX_WARPS_PER_SM) / 2)
#else
#define BNB_MAX_BLOCKS_PER_SM 16
#endif