update

0ab605cc · Jiezhong Qiu · b83ac1a5 · 0ab605cc · 0ab605cc
Commit 0ab605cc authored Dec 29, 2020 by Jiezhong Qiu
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 36 deletions

pytorch/cuda/moe.py pytorch/cuda/moe.py +10 -14

pytorch/cuda/moe_cuda_kernel.cu pytorch/cuda/moe_cuda_kernel.cu +2 -22

No files found.
--- a/pytorch/cuda/moe.py
+++ b/pytorch/cuda/moe.py
@@ -11,8 +11,10 @@ torch.cuda.manual_seed(42)
 class MOEFunction(Function):
    @staticmethod
    def forward(ctx, inp, gate, weight):
-        output = moe_cuda.forward(inp, gate, weight)
-        variables = [inp, gate, weight]
+        out_feat, in_feat = weight.size()[1:]
+        weight_column_major = weight.transpose(-1, -2).contiguous().view(-1, out_feat, in_feat)
+        output = moe_cuda.forward(inp, gate, weight_column_major)
+        variables = [inp, gate, weight_column_major]
        ctx.save_for_backward(*variables)

        return output[0]
@@ -21,7 +23,9 @@ class MOEFunction(Function):
    def backward(ctx, grad_out):
        grad_inp, grad_weight = moe_cuda.backward(
            grad_out.contiguous(), *ctx.saved_tensors)
-        return grad_inp, None, grad_weight
+        out_feat, in_feat = grad_weight.size()[1:]
+        grad_weight_row_major = grad_weight.transpose(-1, -2).contiguous().view(-1, out_feat, in_feat)
+        return grad_inp, None, grad_weight_row_major


 class MOELayer(nn.Module):
@@ -66,9 +70,9 @@ class MOELayer_einsum(nn.Module):
            x[i] = self.weight[gate_long[i]] @ inp[i]
        return x

-batch_size = 1
-num_expert = 1
-in_feat = 3
+batch_size = 4
+num_expert = 4
+in_feat = 2
 out_feat = 3

 moe = MOELayer(num_expert, in_feat, out_feat).cuda()
@@ -79,15 +83,7 @@ moe_einsum.weight.data = moe.weight.data.clone()
 inp = torch.rand(batch_size, in_feat).cuda()
 gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()

-print(inp.type())
-print(moe.weight.data.type())
-
-print(inp)
-print(gate)
 output = moe(inp, gate)
-
-print(inp)
-print(gate)
 output_einsum = moe_einsum(inp.clone(), gate.clone())

 print(output)

--- a/pytorch/cuda/moe_cuda_kernel.cu
+++ b/pytorch/cuda/moe_cuda_kernel.cu
@@ -140,18 +140,13 @@ void moe_cuda_forward_impl(
        const size_t out_feat,
        const size_t num_expert,
        cublasOperation_t transb) {
-    /*
-    cublasHandle_t handle;
-	cudaStream_t st;
-	checkCudaErrors(cudaStreamCreate(&st));
-    checkCudaErrors(cublasCreate(&handle));
-    */
+
    Helper* h = getHelper(num_expert);

    checkCudaErrors(cublasSetStream(h->handle, *(h->streams)));

    // setup Aarray, Barray and Carray
-	std::vector<const scalar_t*> aptrs, bptrs;
+	std::vector<const scalar_t*> aptrs;
    std::vector<scalar_t*> cptrs;
 	
    const scalar_t **Aarray;
@@ -161,12 +156,8 @@ void moe_cuda_forward_impl(
    checkCudaErrors(cudaMalloc(&Barray, batch_size * sizeof(const scalar_t*)));
    checkCudaErrors(cudaMalloc(&Carray, batch_size * sizeof(scalar_t*)));

-    int* gate_host = new int[batch_size];
-    checkCudaErrors(cudaMemcpy(gate_host, gate, batch_size * sizeof(int), cudaMemcpyDeviceToHost));
-
 	for (size_t i=0; i<batch_size; ++i) {
        aptrs.push_back(input + in_feat * i);
-        bptrs.push_back(weight + out_feat * in_feat * gate_host[i]);
        cptrs.push_back(output + out_feat * i);
 	}
 	checkCudaErrors(cudaMemcpy(Aarray, aptrs.data(), batch_size * sizeof(const scalar_t*), cudaMemcpyHostToDevice));
@@ -177,17 +168,6 @@ void moe_cuda_forward_impl(
 	dim3 blockdim(256);
    generate_ptr_offset_kernel<<<griddim, blockdim, 0, *(h->streams)>>>(batch_size, weight, out_feat * in_feat, gate, Barray);

-    const scalar_t **B = (const scalar_t **)malloc(batch_size * sizeof(const scalar_t*));
-    checkCudaErrors(cudaMemcpy(B, Barray, batch_size * sizeof(const scalar_t*), cudaMemcpyDeviceToHost));
-    
-    std::cout << input << " " << weight << " " << output << std::endl;
-    for (size_t i=0; i<batch_size; ++i) {
-        std::cout << i << std::endl;
-        std::cout << "A " << aptrs[i] << std::endl;
-        std::cout << "B " << B[i] << " " << bptrs[i] << std::endl;
-        std::cout << "C " << cptrs[i] << std::endl;
-    }
-
    scalar_t alpha = 1, beta = 0;
 	checkCudaErrors(cublasXgemmBatched(h->handle, 
 			CUBLAS_OP_N,