#pragma once

// TODO: Let's make some of these constexpr and put in a namespace.

// Compute-capability identifiers. The values use the same encoding as
// __CUDA_ARCH__ (major * 100 + minor * 10), so they can be compared
// against it directly in preprocessor conditionals.

// Maxwell family (5.x)
#define BNB_CC_MAXWELL 500
#define BNB_CC_MAXWELL2 520
#define BNB_CC_MAXWELL2_X1 530

// Pascal family (6.x)
#define BNB_CC_PASCAL 600
#define BNB_CC_PASCAL_X2 620

// Volta / Turing family (7.x)
#define BNB_CC_VOLTA 700
#define BNB_CC_VOLTA_XAVIER 720
#define BNB_CC_TURING 750

// Ampere / Ada family (8.x)
#define BNB_CC_AMPERE 800
#define BNB_CC_AMPERE2 860
#define BNB_CC_AMPERE2_ORIN 870
#define BNB_CC_ADA 890

// Hopper (9.x) and Blackwell (10.x)
#define BNB_CC_HOPPER 900
#define BNB_CC_BLACKWELL 1000

// Feature gates keyed on the architecture currently being compiled for.
// Each one is true when __CUDA_ARCH__ is at least the named compute capability.
// __CUDA_ARCH__ is only defined by nvcc during device compilation; inside an
// #if on the host pass it evaluates to 0, so every gate below is false there.

// FP16 gate: BNB_CC_MAXWELL2_X1 and newer.
#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1)

// FP16 MMA gate: BNB_CC_VOLTA and newer.
#define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA)

// INT8 MMA gate: BNB_CC_VOLTA_XAVIER and newer.
#define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER)

// BF16 gate: BNB_CC_AMPERE and newer.
#define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE)

// FP8 gate: BNB_CC_ADA and newer.
#define BNB_FP8_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_ADA)

// Number of threads in one warp.
#define BNB_WARP_SIZE 32

// The maximum number of resident threads per SM varies by arch.
// For A100/H100 and all prior to Turing, it is 2048, which allows
// for 2 full blocks of 1024 threads per SM. The exceptions handled here:
//   7.5 (Turing)                    -> 1024
//   8.6 / 8.7 / 8.9 (Ampere2 / Ada) -> 1536
//   12.x (e.g. sm_120 / sm_121)     -> 1536
// Any architecture not listed falls through to the 2048 default, so new
// compute capabilities should be checked against the table referenced below.
// __CUDA_ARCH__ is undefined during the host compilation pass; it evaluates
// to 0 inside #if there, which also selects the 2048 default.
// Reference: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
#if __CUDA_ARCH__ == 750
#define BNB_MAX_THREADS_PER_SM      1024
#elif (__CUDA_ARCH__ >= 860 && __CUDA_ARCH__ <= 890) || (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
// 12.x was previously treated as 2048, overstating the thread limit and
// every limit derived from it.
#define BNB_MAX_THREADS_PER_SM      1536
#else
#define BNB_MAX_THREADS_PER_SM      2048
#endif

// Resident-warp limit per SM, derived from the resident-thread limit:
// threads per SM divided by threads per warp.
#define BNB_MAX_WARPS_PER_SM \
    ((BNB_MAX_THREADS_PER_SM) / (BNB_WARP_SIZE))

// Resident-block limit per SM. The general rule used here is half the
// resident-warp limit; 8.6 and 8.7 are pinned to 16 because the rule would
// otherwise give them 24.
#if __CUDA_ARCH__ != 860 && __CUDA_ARCH__ != 870
#define BNB_MAX_BLOCKS_PER_SM       ((BNB_MAX_WARPS_PER_SM) / 2)
#else
#define BNB_MAX_BLOCKS_PER_SM       16
#endif