Commit bb92d30e authored by Rick Ho

limit by threshold cuda code

parent 450549a4
@@ -2,6 +2,7 @@
#include <vector>
#include <torch/extension.h>
// global_exchange
#ifdef FMOE_USE_NCCL
#include <c10d/ProcessGroupNCCL.hpp>
std::vector<torch::Tensor> _expert_exchange(
@@ -20,6 +21,7 @@ std::vector<torch::Tensor> _global_gather(
void _ensure_nccl(c10d::ProcessGroupNCCL& p, torch::Tensor t);
#endif // FMOE_USE_NCCL
// local_exchange
std::vector<torch::Tensor> _expert_count(
    torch::Tensor gate,
    size_t num_expert);
@@ -30,16 +32,22 @@ std::vector<torch::Tensor> _local_gather(
    torch::Tensor output_buf,
    torch::Tensor pos);
// parallel_linear
std::vector<torch::Tensor> _linear_forward(
    torch::Tensor input_buf,
    torch::Tensor weight,
    torch::Tensor expert_count);
std::vector<torch::Tensor> _linear_backward(
    torch::Tensor grad_output_buf, // [batch_size x out_feat]
    torch::Tensor input_buf,       // [batch_size x in_feat]
    torch::Tensor weight,          // [num_expert x out_feat x in_feat]
    torch::Tensor expert_count);
// balancing
std::vector<torch::Tensor> _limit_by_capacity(
    torch::Tensor expert_count, torch::Tensor capacity,
    long n_expert, long n_worker);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
#ifdef FMOE_USE_NCCL
m.def("expert_exchange", &_expert_exchange, "FastMoE expert exchange (CUDA)");
@@ -54,4 +62,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("linear_forward", &_linear_forward, "FastMoE forward (CUDA)");
m.def("linear_backward", &_linear_backward, "FastMoE backward (CUDA)");
m.def("limit_by_capacity", &_limit_by_capacity, "FastMoE limit experts by capacity(CUDA)");
}
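The test program further down drives fmoe_cuda_limit_by_capacity_impl, which the declaration above exposes to Python as limit_by_capacity. For context, here is a minimal sketch of what such a kernel could look like. The thread mapping, the worker-major count layout (index w * n_expert + e), and the in-place capacity update are assumptions inferred from the test (which prints cap[0] after the call); this is not the committed implementation.

// Hypothetical sketch: clamp each expert's per-worker token counts to its capacity.
// Assumed layout: expert_count[w * n_expert + e], one entry per (worker, expert) pair.
__global__
void limit_by_capacity_sketch(const long* expert_count, int* capacity,
        long* capped_count, const int n_expert, const int n_worker) {
    int e = blockIdx.x * blockDim.x + threadIdx.x;
    if (e >= n_expert) return;
    int cap_left = capacity[e];
    for (int w = 0; w < n_worker; ++w) {
        long c = expert_count[w * n_expert + e];
        // Admit at most the remaining capacity for this expert.
        long kept = c < (long)cap_left ? c : (long)cap_left;
        capped_count[w * n_expert + e] = kept;
        cap_left -= (int)kept;
    }
    capacity[e] = cap_left;  // updated in place; the test reads cap[0] back afterwards
}

One thread per expert suffices here because n_worker is typically small; each thread serially clamps its expert's per-worker counts against the remaining capacity.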
default : test_balancing
test_% : %.cu
nvcc $< ../stream_manager.cpp -lcublas -o $@
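Assuming this Makefile sits in the tests directory next to the .cu source, make builds the default target test_balancing, compiling the test together with ../stream_manager.cpp and linking cuBLAS. The resulting binary takes three positional arguments, e.g. ./test_balancing 2 2 3 for two workers, two experts per worker, and a capacity of 3 per expert.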
#include "../balancing.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
int main(int argc, char* args[]) {
int n_worker = atoi(args[1]);
int n_expert = atoi(args[2]);
int cap_v = atoi(args[3]);
int tot_expert = n_worker * n_expert;
long* lec = new long[tot_expert];
for (int i = 0; i < tot_expert; ++i) {
lec[i] = i;
}
long* g_lec;
cudaMalloc(&g_lec, sizeof(long) * tot_expert);
cudaMemcpy(g_lec, lec, sizeof(long) * tot_expert, cudaMemcpyHostToDevice);
int* cap = new int[n_expert];
for (int i = 0; i < n_expert; ++i) {
cap[i] = cap_v;
}
int* g_cap;
cudaMalloc(&g_cap, sizeof(int) * n_expert);
cudaMemcpy(g_cap, cap, sizeof(int) * n_expert, cudaMemcpyHostToDevice);
long* eca = new long[tot_expert];
long* g_eca;
cudaMalloc(&g_eca, sizeof(long) * tot_expert);
auto smgr = getCudaStreamManager(0);
fmoe_cuda_limit_by_capacity_impl(g_lec, g_cap, g_eca, n_expert, n_worker, smgr);
cudaMemcpy(cap, g_cap, sizeof(int) * n_expert, cudaMemcpyDeviceToHost);
cudaMemcpy(eca, g_eca, sizeof(long) * tot_expert, cudaMemcpyDeviceToHost);
printf("%d\n", cap[0]);
for (int i = 0; i < tot_expert; ++i) {
printf("%ld %ld\n", lec[i], eca[i]);
}
}
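Under the worker-major layout assumed in the sketch above, a run with ./test_balancing 2 2 3 would start from lec = [0, 1, 2, 3]. Expert 0 sees counts 0 (worker 0) and 2 (worker 1), both within its capacity of 3, leaving 1 slot; expert 1 sees 1 and then 3, admits the 1 and only 2 of the 3, exhausting its capacity. The expected output would then be eca = [0, 1, 2, 2] with cap[0] printed as 1. If the actual kernel uses a different count layout the per-expert grouping changes, but the clamping logic is the same.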