ops.h 25.9 KB
Newer Older
1
2
#pragma once

3
#include <optional>
4
#include <torch/library.h>
5

6
7
#include "core/scalar_type.hpp"

8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <vector>

// Builds a second tensor over the same memory as `tensor`.
//
// The result is created with torch::from_blob from the input's raw data
// pointer, sizes, strides and options (dtype, device). torch::from_blob does
// not take ownership of the memory it wraps, so the returned tensor is only
// valid while the original allocation is kept alive by someone else.
//
// Throws std::runtime_error when `tensor` is not a CUDA tensor.
//
// Marked `inline` because the function is defined in a header: without it,
// every translation unit that includes this header emits its own external
// definition and the link fails with duplicate symbols.
inline torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
  // Only CUDA tensors are accepted.
  if (!tensor.is_cuda()) {
    throw std::runtime_error("Tensor must be on CUDA device");
  }

  // Re-wrap the existing buffer with identical shape metadata and options.
  return torch::from_blob(tensor.data_ptr(), tensor.sizes(), tensor.strides(),
                          tensor.options());
}

32
void paged_attention_v1(
33
34
35
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
36
    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
37
38
39
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
40
41
42
43
44
45
46
47
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
48
    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
49
50
51
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
52
53
54
55
56
57
58
59
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

// Declares paged_attention_v1_opt; the definition is not part of this header.
// The parameter list matches paged_attention_v1, except that `alibi_slopes`
// is spelled c10::optional rather than std::optional.
// NOTE(review): confirm c10::optional and std::optional name the same type in
// the PyTorch version this builds against.
void paged_attention_v1_opt(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

// Declares paged_attention_v2_opt; the definition is not part of this header.
// The parameter list matches paged_attention_v2, except that `alibi_slopes`
// is spelled c10::optional rather than std::optional.
// NOTE(review): confirm c10::optional and std::optional name the same type in
// the PyTorch version this builds against.
void paged_attention_v2_opt(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

// Declares paged_attention_v1_opt_tc; the definition is not part of this
// header. Same parameter list as paged_attention_v1_opt.
void paged_attention_v1_opt_tc(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

// Declares paged_attention_v2_opt_tc; the definition is not part of this
// header. Same parameter list as paged_attention_v2_opt.
void paged_attention_v2_opt_tc(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);


// paged_attention with attn_masks
void paged_attention_v1_with_mask(
100
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
101
102
103
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
104
    const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale,
105
    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
106
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
107
108
109
    const int64_t blocksparse_head_sliding_step,
    const c10::optional<torch::Tensor>& attn_masks,
    const int64_t attn_masks_stride=0);
110

111
void paged_attention_v2_with_mask(
112
113
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
114
115
116
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
117
    const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale,
118
    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
119
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
120
121
122
    const int64_t blocksparse_head_sliding_step,
    const c10::optional<torch::Tensor>& attn_masks,
    const int64_t attn_masks_stride=0);
123

124
void paged_attention_v1_opt_with_mask(
zhuwenwen's avatar
zhuwenwen committed
125
126
127
128
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
129
    const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale,
zhuwenwen's avatar
zhuwenwen committed
130
131
    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
132
133
134
    const int64_t blocksparse_head_sliding_step,
    const c10::optional<torch::Tensor>& attn_masks,
    const int64_t attn_masks_stride=0);
zhuwenwen's avatar
zhuwenwen committed
135

136
void paged_attention_v2_opt_with_mask(
zhuwenwen's avatar
zhuwenwen committed
137
138
139
140
141
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
142
    const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale,
zhuwenwen's avatar
zhuwenwen committed
143
144
    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
145
146
147
    const int64_t blocksparse_head_sliding_step,
    const c10::optional<torch::Tensor>& attn_masks,
    const int64_t attn_masks_stride=0);
zhuwenwen's avatar
zhuwenwen committed
148

149
void paged_attention_v1_opt_tc_with_mask(
150
151
152
153
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
154
    const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale,
zhuwenwen's avatar
zhuwenwen committed
155
    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
156
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
157
158
159
    const int64_t blocksparse_head_sliding_step,
    const c10::optional<torch::Tensor>& attn_masks,
    const int64_t attn_masks_stride=0);
160

161
void paged_attention_v2_opt_tc_with_mask(
162
163
164
165
166
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
167
    const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale,
zhuwenwen's avatar
zhuwenwen committed
168
    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
169
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
170
171
172
    const int64_t blocksparse_head_sliding_step,
    const c10::optional<torch::Tensor>& attn_masks,
    const int64_t attn_masks_stride=0);
173

174
175
176
177
178
179
// Declares merge_attn_states; the definition is not part of this header.
// `output` is a mutable reference, `output_lse` is an optional tensor passed
// by value, and the prefix/suffix output and lse tensors are read-only.
void merge_attn_states(
    torch::Tensor& output, std::optional<torch::Tensor> output_lse,
    const torch::Tensor& prefix_output, const torch::Tensor& prefix_lse,
    const torch::Tensor& suffix_output, const torch::Tensor& suffix_lse);
180

zhuwenwen's avatar
zhuwenwen committed
181

182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
// Declares convert_vertical_slash_indexes; the definition is not part of this
// header. The first four tensors are mutable references, the remaining
// tensors are passed by value. Shapes are the ones annotated per parameter.
void convert_vertical_slash_indexes(
    torch::Tensor& block_count,      // [BATCH, N_HEADS, NUM_ROWS]
    torch::Tensor& block_offset,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
    torch::Tensor& column_count,     // [BATCH, N_HEADS, NUM_ROWS]
    torch::Tensor& column_index,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
    torch::Tensor q_seqlens,         // [BATCH, ]
    torch::Tensor kv_seqlens,        // [BATCH, ]
    torch::Tensor vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
    torch::Tensor slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
    int64_t context_size,
    int64_t block_size_M,
    int64_t block_size_N,
    bool causal);

// Declares convert_vertical_slash_indexes_mergehead; the definition is not
// part of this header. Same leading parameters as
// convert_vertical_slash_indexes, plus vertical_indices_count and
// slash_indices_count before the scalar arguments.
void convert_vertical_slash_indexes_mergehead(
    torch::Tensor& block_count,            // [BATCH, N_HEADS, NUM_ROWS]
    torch::Tensor& block_offset,           // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
    torch::Tensor& column_count,           // [BATCH, N_HEADS, NUM_ROWS]
    torch::Tensor& column_index,           // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
    torch::Tensor q_seqlens,               // [BATCH, ]
    torch::Tensor kv_seqlens,              // [BATCH, ]
    torch::Tensor vertical_indexes,        // [BATCH, N_HEADS, NNZ_V]
    torch::Tensor slash_indexes,           // [BATCH, N_HEADS, NNZ_S]
    torch::Tensor vertical_indices_count,  // [N_HEADS, ]
    torch::Tensor slash_indices_count,
    int64_t context_size,
    int64_t block_size_M,
    int64_t block_size_N,
    bool causal);
206

207
void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
208
              double epsilon);
209
210

void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
211
                        torch::Tensor& weight, double epsilon);
212

zhuwenwen's avatar
zhuwenwen committed
213
214
215
216
217
218
// Declares rms_norm_opt; same parameter list as rms_norm. The definition is
// not part of this header.
void rms_norm_opt(torch::Tensor& out, torch::Tensor& input,
                  torch::Tensor& weight, double epsilon);

// Declares fused_add_rms_norm_opt; same parameter list as fused_add_rms_norm.
// The definition is not part of this header.
void fused_add_rms_norm_opt(torch::Tensor& input, torch::Tensor& residual,
                            torch::Tensor& weight, double epsilon);

219
220
221
222
223
// Declares apply_repetition_penalties_; the definition is not part of this
// header. `logits` is the only mutable tensor; both masks and the penalties
// are const references.
void apply_repetition_penalties_(
    torch::Tensor& logits, const torch::Tensor& prompt_mask,
    const torch::Tensor& output_mask,
    const torch::Tensor& repetition_penalties);

zhuwenwen's avatar
zhuwenwen committed
224
225
226
// void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
//                                torch::Tensor& weight, torch::Tensor& scale,
//                                double epsilon);

// void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out,
//                                          torch::Tensor& input,
//                                          torch::Tensor& residual,
//                                          torch::Tensor& weight,
//                                          torch::Tensor& scale, double epsilon);
233

234
235
236
237
238
239
240
// Declares rms_norm_dynamic_per_token_quant; the definition is not part of
// this header. `out` and `scales` are mutable references, `input` and
// `weight` are read-only, and `scale_ub` / `residual` are optional tensors
// passed by value.
void rms_norm_dynamic_per_token_quant(
    torch::Tensor& out, torch::Tensor const& input,
    torch::Tensor const& weight, torch::Tensor& scales, double const epsilon,
    std::optional<torch::Tensor> scale_ub,
    std::optional<torch::Tensor> residual);
241

242
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
243
                      std::optional<torch::Tensor> key, int64_t head_size,
244
245
246
                      torch::Tensor& cos_sin_cache, bool is_neox);

void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
247
248
249
                              std::optional<torch::Tensor> key,
                              int64_t head_size, torch::Tensor& cos_sin_cache,
                              bool is_neox, int64_t rot_dim,
250
                              torch::Tensor& cos_sin_cache_offsets);
huangwb's avatar
huangwb committed
251
252
253
254
255
256
257
// Declares rotary_embedding_tgi; the definition is not part of this header.
// Unlike rotary_embedding it has no `positions` parameter, `key` is
// mandatory, and the cosine and sine caches are separate tensors.
void rotary_embedding_tgi(torch::Tensor& query, torch::Tensor& key,
                          int64_t head_size, torch::Tensor& cos_cache,
                          torch::Tensor& sin_cache, bool is_neox);
258
259
260

void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

zhuwenwen's avatar
zhuwenwen committed
261
262
// void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
//                         torch::Tensor& scale);
263

264
265
void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

266
267
268
269
void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);

void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);

zhuwenwen's avatar
zhuwenwen committed
270
271
272
273
274
275
void silu_and_mul_opt(torch::Tensor& out, torch::Tensor& input);

void gelu_and_mul_opt(torch::Tensor& out, torch::Tensor& input);

void gelu_tanh_and_mul_opt(torch::Tensor& out, torch::Tensor& input);

276
277
278
void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input,
                     double threshold);

279
280
281
void gelu_new(torch::Tensor& out, torch::Tensor& input);

void gelu_fast(torch::Tensor& out, torch::Tensor& input);
282

283
284
void gelu_quick(torch::Tensor& out, torch::Tensor& input);

zhuwenwen's avatar
zhuwenwen committed
285
286
// Declares trans_w16_gemm; the definition is not part of this header.
// Both tensors are passed by value; `row` and `col` are 64-bit integers.
void trans_w16_gemm(torch::Tensor dst, torch::Tensor src, int64_t row,
                    int64_t col);

287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
// Declares advance_step_flashattn; the definition is not part of this header.
// Takes three scalar counts/sizes followed by six mutable tensor references.
void advance_step_flashattn(
    int64_t num_seqs, int64_t num_queries, int64_t block_size,
    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
    torch::Tensor& input_positions, torch::Tensor& seq_lens,
    torch::Tensor& slot_mapping, torch::Tensor& block_tables);

// Declares advance_step_flashinfer; the definition is not part of this
// header. Same parameters as advance_step_flashattn plus four paged-KV
// related tensors at the end.
void advance_step_flashinfer(int64_t num_seqs, int64_t num_queries,
                             int64_t block_size, torch::Tensor& input_tokens,
                             torch::Tensor& sampled_token_ids,
                             torch::Tensor& input_positions,
                             torch::Tensor& seq_lens,
                             torch::Tensor& slot_mapping,
                             torch::Tensor& block_tables,
                             torch::Tensor& paged_kv_indices,
                             torch::Tensor& paged_kv_indptr,
                             torch::Tensor& paged_kv_last_page_len,
                             torch::Tensor& block_table_bounds);
302

zhuwenwen's avatar
zhuwenwen committed
303
304
305
306
307
// void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
//                         torch::Tensor const& q_pe,
//                         torch::Tensor const& kv_c_and_k_pe_cache,
//                         torch::Tensor const& seq_lens,
//                         torch::Tensor const& page_table, double scale);
308

309
310
// Declares get_cuda_view_from_cpu_tensor; the definition is not part of this
// header. Takes a mutable tensor reference and returns a tensor by value.
torch::Tensor get_cuda_view_from_cpu_tensor(
    torch::Tensor& cpu_tensor);

311
#ifndef USE_ROCM
312
313
314
// AQLM / AWQ declarations, compiled only when USE_ROCM is not defined (see
// the enclosing #ifndef). The definitions are not part of this header.

// `bias` is optional.
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,
                        const torch::Tensor& scales,
                        const std::vector<int64_t>& codebook_partition_sizes,
                        const std::optional<torch::Tensor>& bias);

torch::Tensor aqlm_dequant(
    const torch::Tensor& codes, const torch::Tensor& codebooks,
    const std::vector<int64_t>& codebook_partition_sizes);

torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
                       int64_t split_k_iters);

torch::Tensor awq_dequantize(torch::Tensor _kernel,
                             torch::Tensor _scaling_factors,
                             torch::Tensor _zeros, int64_t split_k_iters,
                             int64_t thx, int64_t thy);

torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
332
#endif
333

334
// GGML declarations. Tensors are passed by value; the definitions are not
// part of this header.

// `dtype` is an optional scalar type.
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                              int64_t n,
                              std::optional<at::ScalarType> const& dtype);

torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
                                  int64_t type, int64_t row);

torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
                              int64_t row);

torch::Tensor ggml_moe_a8(torch::Tensor X, torch::Tensor W,
                          torch::Tensor sorted_token_ids,
                          torch::Tensor expert_ids,
                          torch::Tensor num_tokens_post_padded, int64_t type,
                          int64_t row, int64_t top_k, int64_t tokens);

torch::Tensor ggml_moe_a8_vec(torch::Tensor X, torch::Tensor W,
                              torch::Tensor topk_ids, int64_t top_k,
                              int64_t type, int64_t row, int64_t tokens);

int64_t ggml_moe_get_block_size(int64_t type);

356
#ifndef USE_ROCM
357
358

// Capability queries: each takes a CUDA device capability as an integer and
// returns a bool. The definitions are not part of this header.
bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability);
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);
bool cutlass_group_gemm_supported(int64_t cuda_device_capability);
362

363
364
365
366
367
void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
                           torch::Tensor const& B, torch::Tensor const& A_sf,
                           torch::Tensor const& B_sf,
                           torch::Tensor const& alpha);

368
369
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                       torch::Tensor const& b, torch::Tensor const& a_scales,
370
                       torch::Tensor const& b_scales,
371
                       std::optional<torch::Tensor> const& bias);
372

373
374
375
376
377
void cutlass_moe_mm(
    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
378
379
    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
    bool per_act_token, bool per_out_ch);
380

381
382
383
384
385
void cutlass_fp4_group_mm(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets);
386
387
388
389
390

void get_cutlass_moe_mm_data(
    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
391
392
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);
393

394
395
396
397
398
399
400
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                  torch::Tensor& problem_sizes1,
                                  torch::Tensor& problem_sizes2,
                                  const torch::Tensor& expert_num_tokens,
                                  const int64_t num_local_experts,
                                  const int64_t padded_m, const int64_t n,
                                  const int64_t k);
401

402
403
404
405
406
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                           torch::Tensor const& b,
                           torch::Tensor const& a_scales,
                           torch::Tensor const& b_scales,
                           torch::Tensor const& azp_adj,
407
408
                           std::optional<torch::Tensor> const& azp,
                           std::optional<torch::Tensor> const& bias);
409

410
411
bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);

412
413
414
415
void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
                              torch::Tensor const& b, torch::Tensor const& e,
                              torch::Tensor const& a_scales,
                              torch::Tensor const& b_scales,
416
                              std::optional<torch::Tensor> const& bias);
417

418
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
419
420
421
422

void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_scale,
                      torch::Tensor const& input_scale);
423
424
425
426
427
428

void scaled_fp4_experts_quant(
    torch::Tensor& output, torch::Tensor& output_scale,
    torch::Tensor const& input, torch::Tensor const& input_global_scale,
    torch::Tensor const& input_offset_by_experts,
    torch::Tensor const& output_scale_offset_by_experts);
429
#endif
430

431
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
432
                              torch::Tensor const& scale,
433
                              std::optional<torch::Tensor> const& azp);
434

435
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
436
                               torch::Tensor& scales,
437
                               std::optional<torch::Tensor> const& azp);
438

439
440
441
442
// torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
//                         torch::Tensor b_gptq_qzeros,
//                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
//                         bool use_exllama, int64_t bit);

// void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

// void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
//                              torch::Tensor const& scale);

// void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
//                               torch::Tensor& scale);

// void dynamic_per_token_scaled_fp8_quant(
//     torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
//     std::optional<torch::Tensor> const& scale_ub);
455

456
457
458
// Declares selective_scan_fwd; the definition is not part of this header.
// Every tensor is a const reference. D_, z_, delta_bias_, query_start_loc,
// cache_indices and has_initial_state are optional.
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                        const torch::Tensor& A, const torch::Tensor& B,
                        const torch::Tensor& C,
                        const std::optional<torch::Tensor>& D_,
                        const std::optional<torch::Tensor>& z_,
                        const std::optional<torch::Tensor>& delta_bias_,
                        bool delta_softplus,
                        const std::optional<torch::Tensor>& query_start_loc,
                        const std::optional<torch::Tensor>& cache_indices,
                        const std::optional<torch::Tensor>& has_initial_state,
                        const torch::Tensor& ssm_states, int64_t pad_slot_id);

// Declares causal_conv1d_update; the definition is not part of this header.
// bias_, cache_seqlens_ and conv_state_indices_ are optional.
void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state,
                          const at::Tensor& weight,
                          const std::optional<at::Tensor>& bias_,
                          bool silu_activation,
                          const std::optional<at::Tensor>& cache_seqlens_,
                          const std::optional<at::Tensor>& conv_state_indices_,
                          int64_t pad_slot_id);

// Declares causal_conv1d_fwd; the definition is not part of this header.
// All tensors other than `x` and `weight` are optional.
void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                       const std::optional<at::Tensor>& bias_,
                       const std::optional<at::Tensor>& conv_states,
                       const std::optional<at::Tensor>& query_start_loc,
                       const std::optional<at::Tensor>& cache_indices,
                       const std::optional<at::Tensor>& has_initial_state,
                       bool silu_activation, int64_t pad_slot_id);
483

484
using fptr_t = int64_t;
485
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
486
487
                      torch::Tensor& rank_data, int64_t rank,
                      bool fully_connected);
488
489
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
490
491
492
493
494
495
496
497
498
499
500

// Declares all_reduce_fuse_norm; the definition is not part of this header.
// Extends the all_reduce parameter list with hidden_size, residual,
// rms_weight and eps.
void all_reduce_fuse_norm(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                          int64_t hidden_size, torch::Tensor& residual,
                          torch::Tensor& rms_weight, double eps,
                          fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);

// Declares all_reduce_fuse_norm_quant; the definition is not part of this
// header. `residual` is an optional tensor passed by value; `scales` and
// `norm_out` are mutable references.
void all_reduce_fuse_norm_quant(fptr_t fa, torch::Tensor& inp,
                                torch::Tensor& out, int64_t hidden_size,
                                torch::Tensor& rms_weight, double eps,
                                torch::Tensor& scales, torch::Tensor& norm_out,
                                fptr_t reg_buffer, int64_t reg_buffer_sz_bytes,
                                std::optional<at::Tensor> residual,
                                bool update_input);

501
void dispose(fptr_t _fa);
502
int64_t meta_size();
503
504
505
506
507
void register_buffer(fptr_t _fa, const std::vector<int64_t>& fake_ipc_ptrs);
std::tuple<std::vector<int64_t>, std::vector<int64_t>>
get_graph_buffer_ipc_meta(fptr_t _fa);
void register_graph_buffers(fptr_t _fa,
                            const std::vector<std::vector<int64_t>>& handles,
508
                            const std::vector<std::vector<int64_t>>& offsets);
zhuwenwen's avatar
zhuwenwen committed
509
510
std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
    int64_t size);
511
512
int64_t open_mem_handle(torch::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);
513
514
515
516
517
518
519
520
521
522
523

#ifdef USE_ROCM
// Declarations compiled only when USE_ROCM is defined (see the enclosing
// #ifdef). The definitions are not part of this header.

// `qr_max_size` is optional and defaults to std::nullopt.
fptr_t init_custom_qr(
    int64_t rank, int64_t world_size,
    std::optional<int64_t> qr_max_size = std::nullopt);

void qr_destroy(fptr_t _fa);

torch::Tensor qr_get_handle(fptr_t _fa);

void qr_open_handles(fptr_t _fa,
                     const std::vector<torch::Tensor>& handles);

// `cast_bf2half` defaults to false.
void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                   int64_t quant_level,
                   bool cast_bf2half = false);

int64_t qr_max_size();
#endif