configs.cuh 2 KB
Newer Older
Chenggang Zhao's avatar
Chenggang Zhao committed
1
2
#pragma once

lijian6's avatar
lijian6 committed
3
4
5
6
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp8.h>
#include <hip/hip_runtime.h>

Chenggang Zhao's avatar
Chenggang Zhao committed
7
8
#define NUM_MAX_NVL_PEERS 8
#define NUM_MAX_RDMA_PEERS 20
lijian6's avatar
lijian6 committed
9
#define NUM_MAX_FIFO_SLOTS 32768
Chenggang Zhao's avatar
Chenggang Zhao committed
10
11
12
13
14
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128

#define FINISHED_SUM_TAG 1024
15

Chenggang Zhao's avatar
Chenggang Zhao committed
16
#define NUM_CPU_TIMEOUT_SECS 100
lijian6's avatar
lijian6 committed
17
18
19
20
21
#define NUM_TIMEOUT_CYCLES 200000000000ll // 200G cycles ~= 100s

#define NUM_WAIT_NANOSECONDS 500

#define NUM_WAIT_CYCLES_TIMES_64 16
Chenggang Zhao's avatar
Chenggang Zhao committed
22
23
24

#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2
25
#define NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL 3
26
#define QUANTIZATION_GROUPSIZE 128
Chenggang Zhao's avatar
Chenggang Zhao committed
27

lijian6's avatar
lijian6 committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#define DEFAULT_NUM_CU 20
#define DEFAULT_NUM_MAX_XGMI_CHUNKED_SEND_TOKENS 6
#define DEFAULT_NUM_MAX_XGMI_CHUNKED_RECV_TOKENS 256
#define DEFAULT_NUM_MAX_RDMA_CHUNKED_SEND_TOKENS 6
#define DEFAULT_NUM_MAX_RDMA_CHUNKED_RECV_TOKENS 256

static constexpr int32_t kWarpSize = 64;
// For ROCm equals to half the wave size or Nvidia warp size
static constexpr int32_t  kEmulatedWarpSize = kWarpSize / 2;
static constexpr uint64_t kFullWarpMask     = 0xffffffffffffffff;
static constexpr uint64_t kFirstHalfMask    = 0x00000000ffffffff;
static constexpr uint64_t kSecondHalfMask   = 0xffffffff00000000;

template <typename T> constexpr inline __host__ __device__ T DIVUP(const T &x, const T &y) {
    return (((x) + ((y) -1)) / (y));
}

template <typename T> inline __host__ __device__ T ALIGN(T a, T b) {
    return DIVUP<T>(a, b) * b;
}

Chenggang Zhao's avatar
Chenggang Zhao committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

// Remove Torch restrictions
#ifdef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_CONVERSIONS__
#endif
#ifdef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF_OPERATORS__
#endif
#ifdef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#endif
#ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#endif
#ifdef __CUDA_NO_BFLOAT162_OPERATORS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
#endif

lijian6's avatar
lijian6 committed
67
68
69
// Remove Torch restrictions for HIP
#ifdef __HIP_NO_HALF_OPERATORS__
#undef __HIP_NO_HALF_OPERATORS__
70
#endif