#pragma once #include #include #include #define NUM_MAX_NVL_PEERS 8 #define NUM_MAX_RDMA_PEERS 20 #define NUM_MAX_FIFO_SLOTS 32768 #define NUM_WORKSPACE_BYTES (32 * 1024 * 1024) #define NUM_MAX_LOCAL_EXPERTS 1024 #define NUM_BUFFER_ALIGNMENT_BYTES 128 #define FINISHED_SUM_TAG 1024 #define NUM_CPU_TIMEOUT_SECS 100 #define NUM_TIMEOUT_CYCLES 200000000000ll // 200G cycles ~= 100s #define NUM_WAIT_NANOSECONDS 500 #define NUM_WAIT_CYCLES_TIMES_64 16 #define LOW_LATENCY_SEND_PHASE 1 #define LOW_LATENCY_RECV_PHASE 2 #define NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL 3 #define FP8_QUANTIZATION_NUM_PER_CHANNEL 128 #define DEFAULT_NUM_CU 20 #define DEFAULT_NUM_MAX_XGMI_CHUNKED_SEND_TOKENS 6 #define DEFAULT_NUM_MAX_XGMI_CHUNKED_RECV_TOKENS 256 #define DEFAULT_NUM_MAX_RDMA_CHUNKED_SEND_TOKENS 6 #define DEFAULT_NUM_MAX_RDMA_CHUNKED_RECV_TOKENS 256 static constexpr int32_t kWarpSize = 64; // For ROCm equals to half the wave size or Nvidia warp size static constexpr int32_t kEmulatedWarpSize = kWarpSize / 2; static constexpr uint64_t kFullWarpMask = 0xffffffffffffffff; static constexpr uint64_t kFirstHalfMask = 0x00000000ffffffff; static constexpr uint64_t kSecondHalfMask = 0xffffffff00000000; template constexpr inline __host__ __device__ T DIVUP(const T &x, const T &y) { return (((x) + ((y) -1)) / (y)); } template inline __host__ __device__ T ALIGN(T a, T b) { return DIVUP(a, b) * b; } // Remove Torch restrictions #ifdef __CUDA_NO_HALF_CONVERSIONS__ #undef __CUDA_NO_HALF_CONVERSIONS__ #endif #ifdef __CUDA_NO_HALF_OPERATORS__ #undef __CUDA_NO_HALF_OPERATORS__ #endif #ifdef __CUDA_NO_HALF2_OPERATORS__ #undef __CUDA_NO_HALF2_OPERATORS__ #endif #ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__ #undef __CUDA_NO_BFLOAT16_CONVERSIONS__ #endif #ifdef __CUDA_NO_BFLOAT162_OPERATORS__ #undef __CUDA_NO_BFLOAT162_OPERATORS__ #endif // Remove Torch restrictions for HIP #ifdef __HIP_NO_HALF_OPERATORS__ #undef __HIP_NO_HALF_OPERATORS__ #endif