launch.cuh 3.23 KB
Newer Older
Chenggang Zhao's avatar
Chenggang Zhao committed
1
2
3
#pragma once

#include "configs.cuh"
4
#include "exception.cuh"
Chenggang Zhao's avatar
Chenggang Zhao committed
5
6

#ifndef SETUP_LAUNCH_CONFIG
7
#ifndef DISABLE_SM90_FEATURES
Chenggang Zhao's avatar
Chenggang Zhao committed
8
9
#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream) \
    cudaLaunchConfig_t cfg = {(num_sms), (num_threads), 0, stream, nullptr, 0}; \
Zhean Xu's avatar
Zhean Xu committed
10
    cudaLaunchAttribute attr[2]; \
Chenggang Zhao's avatar
Chenggang Zhao committed
11
12
    attr[0].id = cudaLaunchAttributeCooperative; \
    attr[0].val.cooperative = 1; \
Zhean Xu's avatar
Zhean Xu committed
13
14
15
16
    attr[1].id = cudaLaunchAttributeClusterDimension; \
    attr[1].val.clusterDim.x = (num_sms % 2 == 0 ? 2 : 1); \
    attr[1].val.clusterDim.y = 1; \
    attr[1].val.clusterDim.z = 1; \
Chenggang Zhao's avatar
Chenggang Zhao committed
17
    cfg.attrs = attr; \
Zhean Xu's avatar
Zhean Xu committed
18
    cfg.numAttrs = 2
19
20
21
22
23
24
#else
#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \
    int __num_sms = (sms); \
    int __num_threads = (threads); \
    auto __stream = (stream)
#endif
Chenggang Zhao's avatar
Chenggang Zhao committed
25
26
27
#endif

#ifndef LAUNCH_KERNEL
28
#ifndef DISABLE_SM90_FEATURES
Chenggang Zhao's avatar
Chenggang Zhao committed
29
#define LAUNCH_KERNEL(config, kernel, ...) CUDA_CHECK(cudaLaunchKernelEx(config, kernel, ##__VA_ARGS__))
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#else
#define LAUNCH_KERNEL(config, kernel, ...) \
do { \
    kernel<<<__num_sms, __num_threads, 0, __stream>>>(__VA_ARGS__); \
    cudaError_t e = cudaGetLastError(); \
    if (e != cudaSuccess) { \
        EPException cuda_exception("CUDA", __FILE__, __LINE__, cudaGetErrorString(e)); \
        fprintf(stderr, "%s\n", cuda_exception.what()); \
        throw cuda_exception; \
    } \
} while (0)
#endif
#endif

#ifndef SET_SHARED_MEMORY_FOR_TMA
#ifndef DISABLE_SM90_FEATURES
#define SET_SHARED_MEMORY_FOR_TMA(kernel) \
EP_HOST_ASSERT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size) == cudaSuccess); \
cfg.dynamicSmemBytes = smem_size;
#else
#define SET_SHARED_MEMORY_FOR_TMA(kernel) void()
#endif
Chenggang Zhao's avatar
Chenggang Zhao committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#endif

#define SWITCH_RANKS(case_macro) \
    switch (num_ranks) { \
        case 2: case_macro(2); \
        case 4: case_macro(4); \
        case 8: case_macro(8); \
        default: EP_HOST_ASSERT(false and "Unsupported ranks"); \
    } while (false)

#define SWITCH_RDMA_RANKS(case_macro) \
    switch (num_ranks / NUM_MAX_NVL_PEERS) { \
        case 2: case_macro(2); \
        case 4: case_macro(4); \
66
        case 6: case_macro(6); \
Chenggang Zhao's avatar
Chenggang Zhao committed
67
        case 8: case_macro(8); \
68
        case 12: case_macro(12); \
Chenggang Zhao's avatar
Chenggang Zhao committed
69
        case 16: case_macro(16); \
Chenggang Zhao's avatar
Chenggang Zhao committed
70
71
        case 18: case_macro(18); \
        case 20: case_macro(20); \
Chenggang Zhao's avatar
Chenggang Zhao committed
72
73
74
75
76
77
78
79
        default: EP_HOST_ASSERT(false and "Unsupported RDMA ranks"); \
    } while (false)

#define SWITCH_RANKS_WITH_DTYPE(dtype, case_macro) \
    switch (num_ranks) { \
        case 2: case_macro(dtype, 2); \
        case 4: case_macro(dtype, 4); \
        case 8: case_macro(dtype, 8); \
Zhean Xu's avatar
Zhean Xu committed
80
        default: EP_HOST_ASSERT(false and "Unsupported ranks"); \
Chenggang Zhao's avatar
Chenggang Zhao committed
81
82
83
84
85
    } while (false)

#define SWITCH_TYPES(case_macro) \
    switch (type) { \
        case CUDA_R_16BF: case_macro(nv_bfloat16); \
Zhean Xu's avatar
Zhean Xu committed
86
        default: EP_HOST_ASSERT(false and "Unsupported type"); \
Chenggang Zhao's avatar
Chenggang Zhao committed
87
88
89
90
    } while (false)

#define SWITCH_HIDDEN(case_macro) \
    switch (hidden) { \
Chenggang Zhao's avatar
Chenggang Zhao committed
91
        case 2048: case_macro(2048); \
Chenggang Zhao's avatar
Chenggang Zhao committed
92
        case 2560: case_macro(2560); \
sleepcoo's avatar
sleepcoo committed
93
        case 4096: case_macro(4096); \
Chenggang Zhao's avatar
Chenggang Zhao committed
94
        case 5120: case_macro(5120); \
Jee Jee Li's avatar
Jee Jee Li committed
95
        case 6144: case_macro(6144); /* For qwen3 coder */ \
Chenggang Zhao's avatar
Chenggang Zhao committed
96
        case 7168: case_macro(7168); \
ruizhang1230's avatar
ruizhang1230 committed
97
        case 8192: case_macro(8192); \
Zhean Xu's avatar
Zhean Xu committed
98
        default: EP_HOST_ASSERT(false and "Unsupported hidden"); \
Chenggang Zhao's avatar
Chenggang Zhao committed
99
    } while (false)