// moe_cuda_kernel.h
// Declarations of the MoE CUDA entry points exposed by this extension.
#ifndef MOE_CUDA_KERNEL_H
#define MOE_CUDA_KERNEL_H

#include <vector>
#include <torch/extension.h>
#include <torch/torch.h>

// Declaration only; the definition is not part of this header.
// Accepts a `gate` tensor together with the number of experts and returns a
// list of tensors.
// NOTE(review): presumably derives per-expert counts (and possibly positions)
// from `gate` -- confirm against the implementation.
std::vector<torch::Tensor> moe_cuda_expert_count(
    torch::Tensor gate,
    size_t num_expert);

// Declaration only; the definition is not part of this header.
// Accepts an `input` tensor and a `pos` tensor and returns a list of tensors.
// NOTE(review): presumably reorders rows of `input` according to `pos` --
// confirm against the implementation.
std::vector<torch::Tensor> moe_cuda_local_scatter(
    torch::Tensor input,
    torch::Tensor pos);

// Declaration only; the definition is not part of this header.
// Accepts an `output_buf` tensor and a `pos` tensor and returns a list of
// tensors.
// NOTE(review): looks like the counterpart of moe_cuda_local_scatter --
// confirm against the implementation.
std::vector<torch::Tensor> moe_cuda_local_gather(
    torch::Tensor output_buf,
    torch::Tensor pos);

// Declaration only; the definition is not part of this header.
// Accepts an input buffer, a weight tensor and an `expert_count` tensor, and
// returns a list of tensors.
// NOTE(review): presumably the forward pass of the expert layer, with
// `expert_count` describing how `input_buf` is split across experts --
// confirm against the implementation.
std::vector<torch::Tensor> moe_cuda_forward(
    torch::Tensor input_buf,
    torch::Tensor weight,
    torch::Tensor expert_count);

// Declaration only; the definition is not part of this header.
// Accepts the gradient of the output buffer plus the same three tensors that
// moe_cuda_forward takes (`input_buf`, `weight`, `expert_count`), and returns
// a list of tensors.
// NOTE(review): presumably returns gradients with respect to the input and
// the weight -- confirm the order and contents against the implementation.
std::vector<torch::Tensor> moe_cuda_backward(
    torch::Tensor grad_output_buf,
    torch::Tensor input_buf,
    torch::Tensor weight,
    torch::Tensor expert_count);

#ifdef MOE_USE_NCCL

// Declaration only; visible only when MOE_USE_NCCL is defined.
// Accepts an input buffer, local and global expert-count tensors, a batch
// size and a worker count, and returns a list of tensors.
// NOTE(review): presumably distributes `input_buf` across `n_workers`
// workers -- confirm against the implementation.
std::vector<torch::Tensor> moe_cuda_global_scatter(
    torch::Tensor input_buf,
    torch::Tensor local_expert_count,
    torch::Tensor global_expert_count,
    long batch_size,
    long n_workers);

// Declaration only; visible only when MOE_USE_NCCL is defined.
// Accepts an output buffer, local and global expert-count tensors, a batch
// size and a worker count, and returns a list of tensors.
// NOTE(review): looks like the counterpart of moe_cuda_global_scatter --
// confirm against the implementation.
std::vector<torch::Tensor> moe_cuda_global_gather(
    torch::Tensor output_buf,
    torch::Tensor local_expert_count,
    torch::Tensor global_expert_count,
    long batch_size,
    long n_workers);

// Expert-count exchange entry point (declared only when MOE_USE_NCCL is defined).
// Declaration only; visible only when MOE_USE_NCCL is defined.
// Accepts a local expert-count tensor, the number of experts and a worker
// count, and returns a list of tensors.
// NOTE(review): presumably swaps expert counts between workers to obtain the
// global counts -- confirm against the implementation.
std::vector<torch::Tensor> moe_cuda_expert_exchange(
    torch::Tensor local_expert_count,
    long num_expert,
    long n_workers);

// End of the declarations guarded by MOE_USE_NCCL.
#endif 

#endif  // MOE_CUDA_KERNEL_H