Commit bb92d30e authored by Rick Ho

limit by threshold cuda code

parent 450549a4
@@ -2,6 +2,7 @@
#include <vector>
#include <torch/extension.h>
// global_exchange
#ifdef FMOE_USE_NCCL
#include <c10d/ProcessGroupNCCL.hpp>
std::vector<torch::Tensor> _expert_exchange(
@@ -20,6 +21,7 @@ std::vector<torch::Tensor> _global_gather(
void _ensure_nccl(c10d::ProcessGroupNCCL& p, torch::Tensor t);
#endif // FMOE_USE_NCCL
// local_exchange
std::vector<torch::Tensor> _expert_count(
    torch::Tensor gate,
    size_t num_expert);
@@ -30,16 +32,22 @@ std::vector<torch::Tensor> _local_gather(
    torch::Tensor output_buf,
    torch::Tensor pos);
// parallel_linear
std::vector<torch::Tensor> _linear_forward(
    torch::Tensor input_buf,
    torch::Tensor weight,
    torch::Tensor expert_count);
std::vector<torch::Tensor> _linear_backward(
    torch::Tensor grad_output_buf, // [batch_size x out_feat]
    torch::Tensor input_buf,       // [batch_size x in_feat]
    torch::Tensor weight,          // [num_expert x out_feat x in_feat]
    torch::Tensor expert_count);
// balancing
std::vector<torch::Tensor> _limit_by_capacity(
    torch::Tensor expert_count, torch::Tensor capacity,
    long n_expert, long n_worker);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
#ifdef FMOE_USE_NCCL
m.def("expert_exchange", &_expert_exchange, "FastMoE expert exchange (CUDA)");
@@ -54,4 +62,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("linear_forward", &_linear_forward, "FastMoE forward (CUDA)");
m.def("linear_backward", &_linear_backward, "FastMoE backward (CUDA)");
m.def("limit_by_capacity", &_limit_by_capacity, "FastMoE limit experts by capacity(CUDA)");
}
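The test program further down drives fmoe_cuda_limit_by_capacity_impl, which the declaration above exposes to Python as limit_by_capacity. For context, here is a minimal sketch of what such a kernel could look like. The thread mapping, the worker-major count layout (index w * n_expert + e), and the in-place capacity update are assumptions inferred from the test (which prints cap[0] after the call); this is not the committed implementation.

// Hypothetical sketch: clamp each expert's per-worker token counts to its capacity.
// Assumed layout: expert_count[w * n_expert + e], one entry per (worker, expert) pair.
__global__
void limit_by_capacity_sketch(const long* expert_count, int* capacity,
        long* capped_count, const int n_expert, const int n_worker) {
    int e = blockIdx.x * blockDim.x + threadIdx.x;
    if (e >= n_expert) return;
    int cap_left = capacity[e];
    for (int w = 0; w < n_worker; ++w) {
        long c = expert_count[w * n_expert + e];
        // Admit at most the remaining capacity for this expert.
        long kept = c < (long)cap_left ? c : (long)cap_left;
        capped_count[w * n_expert + e] = kept;
        cap_left -= (int)kept;
    }
    capacity[e] = cap_left;  // updated in place; the test reads cap[0] back afterwards
}

One thread per expert suffices here because n_worker is typically small; each thread serially clamps its expert's per-worker counts against the remaining capacity.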
default : test_balancing
test_% : %.cu
nvcc $< ../stream_manager.cpp -lcublas -o $@
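Assuming this Makefile sits in the tests directory next to the .cu source, make builds the default target test_balancing, compiling the test together with ../stream_manager.cpp and linking cuBLAS. The resulting binary takes three positional arguments, e.g. ./test_balancing 2 2 3 for two workers, two experts per worker, and a capacity of 3 per expert.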
#include "../balancing.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
int main(int argc, char* args[]) {
int n_worker = atoi(args[1]);
int n_expert = atoi(args[2]);
int cap_v = atoi(args[3]);
int tot_expert = n_worker * n_expert;
long* lec = new long[tot_expert];
for (int i = 0; i < tot_expert; ++i) {
lec[i] = i;
}
long* g_lec;
cudaMalloc(&g_lec, sizeof(long) * tot_expert);
cudaMemcpy(g_lec, lec, sizeof(long) * tot_expert, cudaMemcpyHostToDevice);
int* cap = new int[n_expert];
for (int i = 0; i < n_expert; ++i) {
cap[i] = cap_v;
}
int* g_cap;
cudaMalloc(&g_cap, sizeof(int) * n_expert);
cudaMemcpy(g_cap, cap, sizeof(int) * n_expert, cudaMemcpyHostToDevice);
long* eca = new long[tot_expert];
long* g_eca;
cudaMalloc(&g_eca, sizeof(long) * tot_expert);
auto smgr = getCudaStreamManager(0);
fmoe_cuda_limit_by_capacity_impl(g_lec, g_cap, g_eca, n_expert, n_worker, smgr);
cudaMemcpy(cap, g_cap, sizeof(int) * n_expert, cudaMemcpyDeviceToHost);
cudaMemcpy(eca, g_eca, sizeof(long) * tot_expert, cudaMemcpyDeviceToHost);
printf("%d\n", cap[0]);
for (int i = 0; i < tot_expert; ++i) {
printf("%ld %ld\n", lec[i], eca[i]);
}
}
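Under the worker-major layout assumed in the sketch above, a run with ./test_balancing 2 2 3 would start from lec = [0, 1, 2, 3]. Expert 0 sees counts 0 (worker 0) and 2 (worker 1), both within its capacity of 3, leaving 1 slot; expert 1 sees 1 and then 3, admits the 1 and only 2 of the 3, exhausting its capacity. The expected output would then be eca = [0, 1, 2, 2] with cap[0] printed as 1. If the actual kernel uses a different count layout the per-expert grouping changes, but the clamping logic is the same.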