debugging

d4dd2a6c · Jiezhong Qiu · 93291a7e · d4dd2a6c · d4dd2a6c
Commit d4dd2a6c authored Dec 29, 2020 by Jiezhong Qiu
Hide whitespace changes
Inline Side-by-side

Showing with 62 additions and 11 deletions

pytorch/cuda/moe.py pytorch/cuda/moe.py +49 -8

pytorch/cuda/moe_cuda_kernel.cu pytorch/cuda/moe_cuda_kernel.cu +13 -3

No files found.
--- a/pytorch/cuda/moe.py
+++ b/pytorch/cuda/moe.py
@@ -6,7 +6,7 @@ import torch
 import moe_cuda

 torch.manual_seed(42)
-
+torch.cuda.manual_seed(42)

 class MOEFunction(Function):
    @staticmethod
@@ -27,29 +27,70 @@ class MOEFunction(Function):
 class MOELayer(nn.Module):
    def __init__(self, num_expert=32, in_feat=1024, out_feat=4096):
        super(MOELayer, self).__init__()
+        self.num_expert = num_expert
+        self.in_feat = in_feat
+        self.out_feat = out_feat
        self.weight = nn.Parameter(
            torch.Tensor(num_expert, out_feat, in_feat))
        self.reset_parameters()

    def reset_parameters(self):
-        pass
+        for i in range(self.num_expert):
+            linear = nn.Linear(in_features=self.in_feat, out_features=self.out_feat)  # layer's own size; a bare out_feat would resolve to a module-level global
+            self.weight.data[i] = linear.weight.data

    def forward(self, input, gate):
        return MOEFunction.apply(input, gate, self.weight)


-batch_size = 64
-num_expert = 32
-in_feat = 512
-out_feat = 512
+class MOELayer_einsum(nn.Module):  # pure-PyTorch layer with the same weight layout as MOELayer
+    def __init__(self, num_expert=32, in_feat=1024, out_feat=4096):
+        super(MOELayer_einsum, self).__init__()
+        self.num_expert = num_expert
+        self.in_feat = in_feat
+        self.out_feat = out_feat
+        self.weight = nn.Parameter(
+            torch.Tensor(num_expert, out_feat, in_feat))
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        for i in range(self.num_expert):
+            linear = nn.Linear(in_features=self.in_feat, out_features=self.out_feat)  # layer's own size; a bare out_feat would resolve to a module-level global
+            self.weight.data[i] = linear.weight.data  # copy a freshly initialised Linear weight into expert i
+    
+    def forward(self, input, gate):
+        gate_long = gate.long()
+        #W = self.weight[gate_long] # [batch_size x out_feat x in_feat]
+        #x = torch.einsum('id,ihd->ih', (input, W)) # [batch_size x out_feat]
+        #return x
+        batch_size = input.size(0)
+        x = input.new_zeros((batch_size, self.out_feat))
+        for i in range(batch_size):
+            x[i] = self.weight[gate_long[i]] @ input[i]  # row i: weight slice picked by gate[i] times input row i
+        return x
+
+batch_size = 2
+num_expert = 2
+in_feat = 2
+out_feat = 4

 moe = MOELayer(num_expert, in_feat, out_feat).cuda()
+moe_einsum = MOELayer_einsum(num_expert, in_feat, out_feat).cuda()
+moe_einsum.weight.data = moe.weight.data.clone()

 input = torch.rand(batch_size, in_feat).cuda()
 gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()

+print(input)
+print(gate)
 output = moe(input, gate)

+print(input)
+print(gate)
+output_einsum = moe_einsum(input, gate)
+
+print(output)
+print(output_einsum)

-y = output.mean()
-y.backward()
\ No newline at end of file
+#y = output.mean()
+#y.backward()
\ No newline at end of file
--- a/pytorch/cuda/moe_cuda_kernel.cu
+++ b/pytorch/cuda/moe_cuda_kernel.cu
@@ -151,7 +151,7 @@ void moe_cuda_forward_impl(
    checkCudaErrors(cublasSetStream(h->handle, *(h->streams)));

    // setup Aarray, Barray and Carray
-	std::vector<const scalar_t*> aptrs;
+	std::vector<const scalar_t*> aptrs, bptrs;
    std::vector<scalar_t*> cptrs;
 	
    const scalar_t **Aarray;
@@ -163,6 +163,7 @@ void moe_cuda_forward_impl(

 	for (size_t i=0; i<batch_size; ++i) {
        aptrs.push_back(input + in_feat * i);
+        bptrs.push_back(weight + out_feat * in_feat * i);
        cptrs.push_back(output + out_feat * i);
 	}
 	checkCudaErrors(cudaMemcpy(Aarray, aptrs.data(), batch_size * sizeof(const scalar_t*), cudaMemcpyHostToDevice));
@@ -173,14 +174,23 @@ void moe_cuda_forward_impl(
 	dim3 blockdim(256);
    generate_ptr_offset_kernel<<<griddim, blockdim, 0, *(h->streams)>>>(batch_size, weight, out_feat * in_feat, gate, Barray);

+    const scalar_t **B = (const scalar_t **)malloc(batch_size * sizeof(const scalar_t*));
+    checkCudaErrors(cudaMemcpy(B, Barray, batch_size * sizeof(const scalar_t*), cudaMemcpyDeviceToHost));
+    
+    std::cout << weight << std::endl;
+    for (size_t i=0; i<batch_size; ++i) {
+        std::cout << B[i] << " " << bptrs[i] << std::endl;
+    }
+
    scalar_t alpha = 1, beta = 0;
+    
 	checkCudaErrors(cublasXgemmBatched(h->handle, 
 			CUBLAS_OP_N,
 			transb,
 			1, out_feat, in_feat,
 			&alpha,
 			Aarray, 1,
-			Barray, out_feat,
+			Barray, (transb == CUBLAS_OP_T) ? out_feat : in_feat,
 			&beta,
 			Carray, 1,
 			batch_size));
@@ -234,7 +244,7 @@ std::vector<torch::Tensor> moe_cuda_forward(
    const auto out_feat = weight.size(1);
    const auto in_feat = weight.size(2);
            
-    // printf("b=%ld, expert=%ld, in_feat (d_model)=%ld, out_feat (d_ffn)=%ld, topk=%ld\n", batch_size, num_expert, in_feat, out_feat, top_k);
+    printf("b=%ld, expert=%ld, in_feat (d_model)=%ld, out_feat (d_ffn)=%ld\n", batch_size, num_expert, in_feat, out_feat);
    auto output = input.new_zeros({batch_size, out_feat});
    
    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_forward_cuda", ([&] {