// torch_bindings.cpp
#include "core/registration.h"
#include "moe_ops.h"

// Operator registrations for the MoE (mixture-of-experts) extension.
//
// Each `m.def(...)` declares an operator schema and each `m.impl(...)` binds a
// C++ function to that schema. In the schema strings, `Tensor!` marks an
// argument the operator writes to and `Tensor?` / `int?` marks an optional
// argument.
//
// Several operators below are declared with `m.def` only.
// NOTE(review): their implementations are presumably registered from the
// kernels' own translation units -- confirm before adding an `m.impl` here.
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Apply topk softmax to the gating outputs.
  m.def(
      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

  // Calculate the result of moe by summing up the partial results
  // from all selected experts.
  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
  m.impl("moe_sum", torch::kCUDA, &moe_sum);

  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size.
  m.def(
      "moe_align_block_size(Tensor topk_ids, int num_experts,"
      "                     int block_size, Tensor! sorted_token_ids,"
      "                     Tensor! experts_ids,"
      "                     Tensor! num_tokens_post_pad,"
      "                     Tensor? maybe_expert_map) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size, but for the batched case.
  m.def(
      "batched_moe_align_block_size(int max_tokens_per_batch,"
      "                     int block_size, Tensor expert_num_tokens,"
      "                     Tensor! sorted_token_ids,"
      "                     Tensor! experts_ids,"
      "                     Tensor! num_tokens_post_pad) -> ()");
  m.impl("batched_moe_align_block_size", torch::kCUDA,
         &batched_moe_align_block_size);

  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size. This variant additionally takes
  // LoRA-related arguments (token_lora_mapping, max_loras, adapter_enabled,
  // lora_ids).
  // NOTE(review): the `Tensor !name` spelling and the trailing space after
  // `-> ()` differ from the other schemas; they are kept exactly as written
  // because the schema text is parsed at registration time.
  m.def(
      "moe_lora_align_block_size(Tensor topk_ids,"
      "                     Tensor token_lora_mapping,"
      "                     int num_experts,"
      "                     int block_size, int max_loras, "
      "                     int max_num_tokens_padded, "
      "                     int max_num_m_blocks, "
      "                     Tensor !sorted_token_ids,"
      "                     Tensor !experts_ids,"
      "                     Tensor !num_tokens_post_pad,"
      "                     Tensor !adapter_enabled,"
      "                     Tensor !lora_ids,"
      "                     Tensor? maybe_expert_map) -> () ");
  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);

  // Everything from here to the matching #endif is only registered when
  // USE_ROCM is not defined.
#ifndef USE_ROCM
  // WNA16 MoE GEMM; returns a Tensor and also takes a writable `output`.
  m.def(
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
      "Tensor b_scales, Tensor? b_qzeros, "
      "Tensor? topk_weights, Tensor sorted_token_ids, "
      "Tensor expert_ids, Tensor num_tokens_post_pad, "
      "int top_k, int BLOCK_SIZE_M, int BLOCK_SIZE_N, int BLOCK_SIZE_K, "
      "int bit) -> Tensor");

  m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm);

  // Schema only; no implementation is bound in this block.
  m.def(
      "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
      "Tensor! b_q_weight, Tensor? b_bias_or_none,"
      "Tensor! b_scales, Tensor? a_scales, Tensor? global_scale, Tensor? "
      "b_zeros_or_none,"
      "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
      "Tensor sorted_token_ids,"
      "Tensor! expert_ids, Tensor! num_tokens_past_padded,"
      "Tensor! topk_weights, int moe_block_size, int top_k, "
      "bool mul_topk_weights, bool is_ep, int b_type_id,"
      "int size_m, int size_n, int size_k,"
      "bool is_full_k, bool use_atomic_add,"
      "bool use_fp32_reduce, bool is_zp_float,"
      "int thread_k, int thread_n, int blocks_per_sm) -> Tensor");

  // Schema only; no implementation is bound in this block.
  m.def(
      "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
      "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
      "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
      "int b_q_type, SymInt size_m, "
      "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int "
      "topk, "
      "int moe_block_size, bool replicate_input, bool apply_weights)"
      " -> Tensor");

  // Schema only; no implementation is bound in this block.
  m.def(
      "moe_permute(Tensor input, Tensor topk_ids,"
      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
      "permuted_idx, Tensor! m_indices)->()");

  // Schema only; no implementation is bound in this block.
  m.def(
      "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
      "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
      "int topk, Tensor! hidden_states)->()");

  // Capability query taking no arguments. Unlike the other bindings here, it
  // is bound without a dispatch key.
  m.def("moe_permute_unpermute_supported() -> bool");
  m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);

  // Row shuffle for MoE
  m.def(
      "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
      "output_tensor) -> ()");
  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);

  // Apply grouped topk routing to select experts.
  m.def(
      "grouped_topk(Tensor scores, int n_group, int "
      "topk_group, int topk, bool renormalize, float "
      "routed_scaling_factor, Tensor bias, int scoring_func) -> (Tensor, "
      "Tensor)");
  m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
#endif
}

// Invokes the project's REGISTER_EXTENSION macro for this extension name.
// NOTE(review): the macro is presumably provided by core/registration.h and
// is assumed to emit the Python module entry point -- confirm there.
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)