Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
FastMoE
Commits
7eb40a4a
"vscode:/vscode.git/clone" did not exist on "55477476cbf7c2298c2d5175554085d7a909a7ea"
Commit
7eb40a4a
authored
Dec 15, 2020
by
Jiezhong Qiu
Browse files
update
parent
efa510bb
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
37 additions
and
23 deletions
+37
-23
pytorch/cuda/CMakeLists.txt
pytorch/cuda/CMakeLists.txt
+3
-2
pytorch/cuda/moe.cpp
pytorch/cuda/moe.cpp
+34
-21
No files found.
# Propagate the compile flags exported by the Torch CMake package.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

# NOTE(review): these are machine-specific absolute paths — prefer
# find_package(Python3 COMPONENTS Development) and find_package(CUDAToolkit)
# so the build works outside this one workstation.
include_directories(
    "/home/jiezhong/anaconda3/envs/torch/include/python3.6m"
    # BUG FIX: the committed path "/usr/local/cuda-/include" is malformed
    # (looks like a truncated "cuda-<version>"); the "cuda" symlink is the
    # conventional version-independent location.
    "/usr/local/cuda/include"
    # helper_cuda.h (checkCudaErrors) lives in the CUDA samples tree.
    "/usr/local/cuda/samples/common/inc"
)

add_executable(moe moe.cpp)
target_link_libraries(moe "${TORCH_LIBRARIES}")
pytorch/cuda/moe.cpp
View file @
7eb40a4a
...
@@ -11,10 +11,10 @@
...
@@ -11,10 +11,10 @@
// CUDA and CUBLAS functions
// CUDA and CUBLAS functions
//#include <helper_functions.h>
//#include <helper_functions.h>
//
#include <helper_cuda.h>
#include <helper_cuda.h>
// Number of CUDA streams used by moe_cuda_forward to round-robin the
// per-(sample, expert) GEMM launches so independent products can overlap.
// This commit reduces the pool from 1024 to 16.
const int num_stream = 16;
// std::vector<torch::Tensor>
// std::vector<torch::Tensor>
void
moe_cuda_forward
(
void
moe_cuda_forward
(
...
@@ -28,48 +28,60 @@ void moe_cuda_forward(
...
@@ -28,48 +28,60 @@ void moe_cuda_forward(
const
auto
d_model
=
weight
.
size
(
1
);
const
auto
d_model
=
weight
.
size
(
1
);
const
auto
d_ffn
=
weight
.
size
(
2
);
const
auto
d_ffn
=
weight
.
size
(
2
);
auto
output
=
input
.
new_zeros
({
batch_size
,
num_expert
,
d_ffn
});
auto
output
=
input
.
new_zeros
({
batch_size
,
num_expert
,
d_ffn
});
std
::
cout
<<
output
<<
std
::
endl
;
cublasHandle_t
handle
;
cublasHandle_t
handle
;
cublasCreate
(
&
handle
);
checkCudaErrors
(
cublasCreate
(
&
handle
)
)
;
cudaStream_t
stream
[
num_stream
];
cudaStream_t
stream
[
num_stream
];
for
(
size_t
i
=
0
;
i
<
num_stream
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
num_stream
;
++
i
)
{
cudaStreamCreate
(
&
stream
[
i
]);
checkCudaErrors
(
cudaStreamCreate
(
&
stream
[
i
])
)
;
}
}
size_t
s
;
size_t
s
;
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
size_t
j
=
0
;
j
<
num_expert
;
++
j
)
{
for
(
size_t
j
=
0
;
j
<
num_expert
;
++
j
)
{
s
=
(
i
*
num_expert
+
j
)
%
num_stream
;
s
=
(
i
*
num_expert
+
j
)
%
num_stream
;
printf
(
"i=%d j=%d goes to stream %d
\n
"
,
i
,
j
,
s
);
printf
(
"i=%d j=%d goes to stream %d
\n
"
,
i
,
j
,
s
);
cublasSetStream
(
handle
,
stream
[
s
]);
cublasSetStream
(
handle
,
stream
[
s
]);
if
(
input
.
scalar_type
()
==
torch
::
ScalarType
::
Double
)
{
if
(
input
.
scalar_type
()
==
torch
::
ScalarType
::
Float
)
{
double
alpha
=
1.0
;
float
alpha
=
1.0
;
double
beta
=
0.0
;
float
beta
=
0.0
;
cublasDgemm
(
handle
,
std
::
cout
<<
input
[
i
]
<<
std
::
endl
;
std
::
cout
<<
weight
.
index
(
gate
[
i
][
j
])
<<
std
::
endl
;
std
::
cout
<<
output
[
i
][
j
]
<<
std
::
endl
;
cublasSgemm
(
handle
,
CUBLAS_OP_N
,
CUBLAS_OP_N
,
CUBLAS_OP_N
,
CUBLAS_OP_N
,
1
,
1
,
// m
d_ffn
,
d_ffn
,
// n
d_model
,
d_model
,
// k
&
alpha
,
&
alpha
,
input
[
i
].
data_ptr
<
double
>
(),
input
.
data_ptr
<
float
>
()
+
i
*
d_model
,
// input[i].data_ptr<float>(),
1
,
1
,
weight
.
index
(
gate
[
i
][
j
]).
data_ptr
<
double
>
(),
weight
.
index
(
gate
[
i
][
j
]).
data_ptr
<
float
>
(),
d_model
,
d_model
,
&
beta
,
&
beta
,
output
[
i
][
j
]
.
data_ptr
<
double
>
()
,
output
.
data_ptr
<
float
>
()
+
i
*
num_expert
*
d_ffn
+
j
*
d_ffn
,
1
);
1
);
}
else
{
}
else
{
printf
(
"only support
double
!!!
\n
"
);
printf
(
"only support
float
!!!
\n
"
);
}
}
}
}
}
}
cudaDeviceSynchronize
();
printf
(
"synchronized
\n
"
);
for
(
size_t
i
=
0
;
i
<
num_stream
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
num_stream
;
++
i
)
{
cudaStreamDestroy
(
stream
[
i
]);
cudaStreamDestroy
(
stream
[
i
]);
}
}
std
::
cout
<<
output
<<
std
::
endl
;
cublasDestroy
(
handle
);
cublasDestroy
(
handle
);
}
}
...
@@ -83,10 +95,11 @@ void moe_cuda_forward(
...
@@ -83,10 +95,11 @@ void moe_cuda_forward(
int
main
()
{
int
main
()
{
torch
::
Tensor
input
=
torch
::
randn
({
2
,
4
},
torch
::
dtype
(
torch
::
kFloat64
).
device
(
torch
::
kCUDA
,
3
));
int
device
=
2
;
torch
::
Tensor
gate
=
torch
::
ones
({
2
,
1
},
torch
::
dtype
(
torch
::
kInt64
).
device
(
torch
::
kCUDA
,
3
));
torch
::
Tensor
input
=
torch
::
randn
({
2
,
4
},
torch
::
dtype
(
torch
::
kFloat32
).
device
(
torch
::
kCUDA
,
device
));
torch
::
Tensor
weight
=
torch
::
randn
({
2
,
4
,
4
},
torch
::
dtype
(
torch
::
kFloat64
).
device
(
torch
::
kCUDA
,
3
));
torch
::
Tensor
gate
=
torch
::
zeros
({
2
,
1
},
torch
::
dtype
(
torch
::
kInt64
).
device
(
torch
::
kCUDA
,
device
));
torch
::
Tensor
bias
=
torch
::
randn
({
2
,
4
},
torch
::
dtype
(
torch
::
kFloat64
).
device
(
torch
::
kCUDA
,
3
));
torch
::
Tensor
weight
=
torch
::
randn
({
2
,
4
,
4
},
torch
::
dtype
(
torch
::
kFloat32
).
device
(
torch
::
kCUDA
,
device
));
torch
::
Tensor
bias
=
torch
::
randn
({
2
,
4
},
torch
::
dtype
(
torch
::
kFloat32
).
device
(
torch
::
kCUDA
,
device
));
std
::
cout
<<
input
<<
std
::
endl
;
std
::
cout
<<
input
<<
std
::
endl
;
moe_cuda_forward
(
input
,
gate
,
weight
,
bias
);
moe_cuda_forward
(
input
,
gate
,
weight
,
bias
);
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment