// moe_ops.h
#pragma once

#include <torch/all.h>
// Declaration of the top-k softmax op; the definition is not in this header.
// All four tensors are taken by non-const reference.
// NOTE(review): presumably `gating_output` is the input and `topk_weights`,
// `topk_indices` and `token_expert_indices` are filled in place — confirm
// against the definition.
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
                  torch::Tensor& gating_output);
// Declaration only; the definition is not in this header. Both tensors are
// taken by non-const reference.
// NOTE(review): presumably reduces `input` into `output` — confirm.
void moe_sum(torch::Tensor& input, torch::Tensor& output);
// Declaration only; same parameter list as moe_sum.
// NOTE(review): presumably an alternative implementation of moe_sum — confirm
// how the two differ against the definitions.
void moe_sum_opt1(torch::Tensor& input, torch::Tensor& output);
// Declaration only; the definition is not in this header. Returns nothing,
// and every tensor is passed by value.
// NOTE(review): presumably `sorted_token_ids`, `experts_ids` and
// `num_tokens_post_pad` receive the results computed from `topk_ids`,
// `num_experts` and `block_size` — confirm against the definition.
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
17
18
19
20
21
22
23
24
25
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
                             std::optional<torch::Tensor> b_qzeros,
                             std::optional<torch::Tensor> topk_weights,
                             torch::Tensor sorted_token_ids,
                             torch::Tensor expert_ids,
                             torch::Tensor num_tokens_post_pad, int64_t top_k,
                             int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                             int64_t BLOCK_SIZE_K, int64_t bit);
王敏's avatar
王敏 committed
26
27
#endif

// Takes no arguments and returns a bool; the definition is not in this header.
// NOTE(review): per the name, reports whether the MoE permute/unpermute ops
// are available in this build — confirm against the definition.
bool moe_permute_unpermute_supported();

// Declaration only; the definition is not in this header.
// `input_tensor` and `dst2src_map` are const references (read-only);
// `output_tensor` is a non-const reference.
// NOTE(review): presumably each destination row of `output_tensor` is taken
// from the `input_tensor` row that `dst2src_map` names — confirm against the
// definition.
void shuffle_rows(const torch::Tensor& input_tensor,
                  const torch::Tensor& dst2src_map,
                  torch::Tensor& output_tensor);

std::vector<torch::Tensor> moe_fused_gate(
    torch::Tensor& input,
    torch::Tensor& bias,
    int64_t num_expert_group,
    int64_t topk_group,
    int64_t topk,
    int64_t n_share_experts_fusion,
zhuwenwen's avatar
zhuwenwen committed
41
    double routed_scaling_factor);