configs.cuh 1.96 KB
Newer Older
Chenggang Zhao's avatar
Chenggang Zhao committed
1
2
#pragma once

lijian6's avatar
lijian6 committed
3
4
5
6
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp8.h>
#include <hip/hip_runtime.h>

Chenggang Zhao's avatar
Chenggang Zhao committed
7
8
#define NUM_MAX_NVL_PEERS 8
#define NUM_MAX_RDMA_PEERS 20
lijian6's avatar
lijian6 committed
9
#define NUM_MAX_FIFO_SLOTS 32768
Chenggang Zhao's avatar
Chenggang Zhao committed
10
11
12
13
14
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128

#define FINISHED_SUM_TAG 1024
15

Chenggang Zhao's avatar
Chenggang Zhao committed
16
#define NUM_CPU_TIMEOUT_SECS 100
lijian6's avatar
lijian6 committed
17
18
19
20
21
#define NUM_TIMEOUT_CYCLES 200000000000ll // 200G cycles ~= 100s

#define NUM_WAIT_NANOSECONDS 500

#define NUM_WAIT_CYCLES_TIMES_64 16
Chenggang Zhao's avatar
Chenggang Zhao committed
22
23
24
25

#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2

lijian6's avatar
lijian6 committed
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#define NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL 3

#define DEFAULT_NUM_CU 20
#define DEFAULT_NUM_MAX_XGMI_CHUNKED_SEND_TOKENS 6
#define DEFAULT_NUM_MAX_XGMI_CHUNKED_RECV_TOKENS 256
#define DEFAULT_NUM_MAX_RDMA_CHUNKED_SEND_TOKENS 6
#define DEFAULT_NUM_MAX_RDMA_CHUNKED_RECV_TOKENS 256

static constexpr int32_t kWarpSize = 64;
// For ROCm equals to half the wave size or Nvidia warp size
static constexpr int32_t  kEmulatedWarpSize = kWarpSize / 2;
static constexpr uint64_t kFullWarpMask     = 0xffffffffffffffff;
static constexpr uint64_t kFirstHalfMask    = 0x00000000ffffffff;
static constexpr uint64_t kSecondHalfMask   = 0xffffffff00000000;

template <typename T> constexpr inline __host__ __device__ T DIVUP(const T &x, const T &y) {
    return (((x) + ((y) -1)) / (y));
}

template <typename T> inline __host__ __device__ T ALIGN(T a, T b) {
    return DIVUP<T>(a, b) * b;
}

Chenggang Zhao's avatar
Chenggang Zhao committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

// Remove Torch restrictions
#ifdef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_CONVERSIONS__
#endif
#ifdef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF_OPERATORS__
#endif
#ifdef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#endif
#ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#endif
#ifdef __CUDA_NO_BFLOAT162_OPERATORS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
#endif

lijian6's avatar
lijian6 committed
67
68
69
// Remove Torch restrictions for HIP
#ifdef __HIP_NO_HALF_OPERATORS__
#undef __HIP_NO_HALF_OPERATORS__
70
#endif