// api.cuh: declarations of the deep_ep runtime and kernel entry points.
#pragma once

#include <cstddef>
#include <cstdint>
#include <vector>

#include "configs.cuh"

namespace deep_ep {

// Intranode runtime
namespace intranode {

12
void barrier(int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);
Chenggang Zhao's avatar
Chenggang Zhao committed
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

} // namespace intranode

// Internode runtime
namespace internode {

// Returns an identifier as a byte vector.
// NOTE(review): presumably the value that is later passed to `init` as `root_unique_id_val` - confirm.
std::vector<uint8_t> get_unique_id();

// Initializes the internode runtime for `rank` out of `num_ranks`, using the root's unique ID bytes.
// NOTE(review): the meaning of the returned `int` and of `low_latency_mode` is not visible
// in this header - confirm against the definition.
int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks, bool low_latency_mode);

// NOTE(review): presumably allocates `size` bytes honoring `alignment` and returns the pointer;
// which memory space it comes from is not visible here - confirm.
void *alloc(size_t size, size_t alignment);

// NOTE(review): presumably releases a pointer previously returned by `alloc` - confirm.
void free(void *ptr);

// Takes no arguments, unlike `intranode::barrier`, which takes signal pointers and a stream.
// NOTE(review): presumably a barrier across all internode ranks - confirm.
void barrier();

// NOTE(review): presumably tears down the state created by `init` - confirm.
void finalize();

} // namespace internode

33
34
35
// Layout kernels
namespace layout {

36
void get_dispatch_layout(const topk_idx_t* topk_idx,
37
38
39
40
41
42
43
                         int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
                         int* num_tokens_per_expert, bool* is_token_in_rank,
                         int num_tokens, int num_topk, int num_ranks, int num_experts,
                         cudaStream_t stream);

} // namespace layout

Chenggang Zhao's avatar
Chenggang Zhao committed
44
45
46
47
48
49
50
// Intranode kernels
namespace intranode {

void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
                     const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
                     int num_tokens, const bool* is_token_in_rank, int* channel_prefix_matrix,
                     int* rank_prefix_matrix_copy, int num_memset_int, int expert_alignment,
51
                     void** buffer_ptrs, int** barrier_signal_ptrs, int rank,
Chenggang Zhao's avatar
Chenggang Zhao committed
52
53
54
                     cudaStream_t stream, int num_sms);

void cached_notify_dispatch(const int* rank_prefix_matrix, int num_memset_int,
55
                            void** buffer_ptrs, int** barrier_signal_ptrs, int rank, int num_ranks,
Chenggang Zhao's avatar
Chenggang Zhao committed
56
57
                            cudaStream_t stream);

58
59
void dispatch(void* recv_x, float* recv_x_scales, int* recv_src_idx, topk_idx_t* recv_topk_idx, float* recv_topk_weights, int* recv_channel_offset,
              int* send_head, const void* x, const float* x_scales, const topk_idx_t* topk_idx, const float* topk_weights,
Chenggang Zhao's avatar
Chenggang Zhao committed
60
              const bool* is_token_in_rank, const int* channel_prefix_matrix,
61
              int num_tokens, int num_worst_tokens, int hidden_int4, int num_topk, int num_experts, int num_scales,
Shifang Xu's avatar
Shifang Xu committed
62
              int scale_token_stride, int scale_hidden_stride,
Chenggang Zhao's avatar
Chenggang Zhao committed
63
64
65
66
67
              void** buffer_ptrs, int rank, int num_ranks,
              cudaStream_t stream, int num_sms,
              int num_max_send_tokens, int num_recv_buffer_tokens);

void cached_notify_combine(void** buffer_ptrs, int* send_head, int num_channels, int num_recv_tokens, int num_memset_int,
68
                           int** barrier_signal_ptrs, int rank, int num_ranks, cudaStream_t stream);
Chenggang Zhao's avatar
Chenggang Zhao committed
69
70
71
72

void combine(cudaDataType_t type,
             void* recv_x, float* recv_topk_weights,
             const void* x, const float* topk_weights,
Shangyan Zhou's avatar
Shangyan Zhou committed
73
             const void* bias_0, const void* bias_1,
Chenggang Zhao's avatar
Chenggang Zhao committed
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
             const int* src_idx, const int* rank_prefix_matrix, const int* channel_prefix_matrix,
             int* send_head, int num_tokens, int num_recv_tokens, int hidden, int num_topk,
             void** buffer_ptrs, int rank, int num_ranks,
             cudaStream_t stream, int num_sms,
             int num_max_send_tokens, int num_recv_buffer_tokens);

} // namespace intranode

// Internode kernels
namespace internode {

int get_source_meta_bytes();

void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
                     const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped,
                     const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
                     const bool* is_token_in_rank, int num_tokens, int num_channels,
                     int hidden_int4, int num_scales, int num_topk, int expert_alignment,
                     int* rdma_channel_prefix_matrix, int* recv_rdma_rank_prefix_sum,
                     int* gbl_channel_prefix_matrix, int* recv_gbl_rank_prefix_sum,
                     void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
                     void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
96
                     int** barrier_signal_ptrs, int rank,
Chenggang Zhao's avatar
Chenggang Zhao committed
97
98
99
                     cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes,
                     bool low_latency_mode);

100
101
void dispatch(void* recv_x, float* recv_x_scales, topk_idx_t* recv_topk_idx, float* recv_topk_weights, void* recv_src_meta,
              const void* x, const float* x_scales, const topk_idx_t* topk_idx, const float* topk_weights,
Chenggang Zhao's avatar
Chenggang Zhao committed
102
103
104
105
106
              int* send_rdma_head, int* send_nvl_head,
              int* recv_rdma_channel_prefix_matrix, int* recv_gbl_channel_prefix_matrix,
              const int* rdma_channel_prefix_matrix, const int* recv_rdma_rank_prefix_sum,
              const int* gbl_channel_prefix_matrix, const int* recv_gbl_rank_prefix_sum,
              const bool* is_token_in_rank,
Shifang Xu's avatar
Shifang Xu committed
107
108
              int num_tokens, int hidden_int4, int num_scales, int num_topk, int num_experts,
              int scale_token_stride, int scale_hidden_stride,
Chenggang Zhao's avatar
Chenggang Zhao committed
109
110
111
112
113
114
115
116
117
118
              void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
              void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
              int rank, int num_ranks, bool is_cached_dispatch,
              cudaStream_t stream, int num_channels, bool low_latency_mode);

void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_topk_weights,
                   int num_ranks, int num_channels, int num_combined_tokens, int* combined_rdma_head,
                   const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, int* combined_nvl_head,
                   void* rdma_buffer_ptr, int num_max_rdma_chunked_recv_tokens,
                   void** buffer_ptrs, int num_max_nvl_chunked_recv_tokens,
119
                   int** barrier_signal_ptrs, int rank, cudaStream_t stream,
Chenggang Zhao's avatar
Chenggang Zhao committed
120
121
122
123
124
125
126
                   int64_t num_rdma_bytes, int64_t num_nvl_bytes,
                   bool is_cached_dispatch, bool low_latency_mode);

void combine(cudaDataType_t type,
             void* combined_x, float* combined_topk_weights,
             const bool* is_combined_token_in_rank,
             const void* x, const float* topk_weights,
Shangyan Zhou's avatar
Shangyan Zhou committed
127
             const void* bias_0, const void* bias_1,
Chenggang Zhao's avatar
Chenggang Zhao committed
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
             const int* combined_rdma_head, const int* combined_nvl_head,
             const void* src_meta, const int* rdma_channel_prefix_matrix, const int* rdma_rank_prefix_sum, const int* gbl_channel_prefix_matrix,
             int num_tokens, int num_combined_tokens, int hidden, int num_topk,
             void* rdma_buffer_ptr, int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens,
             void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
             int rank, int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode);

} // namespace internode

// Internode low-latency kernels
namespace internode_ll {

void clean_low_latency_buffer(int* clean_0, int num_clean_int_0,
                              int* clean_1, int num_clean_int_1,
                              cudaStream_t stream);

Shifang Xu's avatar
Shifang Xu committed
144
void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
Chenggang Zhao's avatar
Chenggang Zhao committed
145
              int* packed_recv_src_info, int64_t* packed_recv_layout_range,
146
              int* packed_recv_count,
147
              int* cumulative_local_expert_recv_stats,
148
              int64_t* dispatch_wait_recv_cost_stats,
Chenggang Zhao's avatar
Chenggang Zhao committed
149
              void* rdma_recv_x, int* rdma_recv_count, void* rdma_x,
150
              const void* x, const topk_idx_t* topk_idx,
Chenggang Zhao's avatar
Chenggang Zhao committed
151
152
              int* next_clean, int num_next_clean_int,
              int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
Shifang Xu's avatar
Shifang Xu committed
153
154
              int num_topk, int num_experts, int rank, int num_ranks,
              bool use_fp8, bool round_scale, bool use_ue8m0,
155
156
              void* workspace, int num_device_sms,
              cudaStream_t stream, int phases);
Chenggang Zhao's avatar
Chenggang Zhao committed
157
158
159

void combine(void* combined_x,
             void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
160
             const void* x, const topk_idx_t* topk_idx, const float* topk_weights,
Chenggang Zhao's avatar
Chenggang Zhao committed
161
             const int* src_info, const int64_t* layout_range,
162
             int64_t* combine_wait_recv_cost_stats,
Chenggang Zhao's avatar
Chenggang Zhao committed
163
164
165
             int* next_clean, int num_next_clean_int,
             int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
             int num_topk, int num_experts, int rank, int num_ranks,
166
             bool use_logfmt,
167
168
             void* workspace, int num_device_sms,
             cudaStream_t stream, int phases, bool zero_copy);
Chenggang Zhao's avatar
Chenggang Zhao committed
169
170
171
172

} // namespace internode_ll

} // namespace deep_ep