Unverified Commit 082f999a authored by Burc Eryilmaz's avatar Burc Eryilmaz Committed by GitHub
Browse files

Fix cublasLt context create/destroy overhead in MLP extension (#1083)

* don't create cublasLt handle, fix zero block size case

* cleanup
parent b8be1bc7
...@@ -718,7 +718,7 @@ void get_biasAddRelu_bprop_grid_size( ...@@ -718,7 +718,7 @@ void get_biasAddRelu_bprop_grid_size(
// Get number of SMs for efficient reduction. // Get number of SMs for efficient reduction.
int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
// can switch to occupancy calculation. use 4 below now for sm_70 // can switch to occupancy calculation. use 4 below now for sm_70
int max_blocks_y = num_SMs * 4 / (*grid_x); int max_blocks_y = (num_SMs * 4+(*grid_x)-1) / (*grid_x);
// block_y should be from minimal work per thread // block_y should be from minimal work per thread
int nRedSplits = (batch_size + block_y - 1) / block_y; int nRedSplits = (batch_size + block_y - 1) / block_y;
// increase number of elem per thread reduction to not launch more than enough // increase number of elem per thread reduction to not launch more than enough
...@@ -1252,9 +1252,6 @@ int mlp_fp( ...@@ -1252,9 +1252,6 @@ int mlp_fp(
// Get cublas handle from Pytorch // Get cublas handle from Pytorch
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasLtHandle_t ltHandle;
cublasStatus_t lthandle_status;
lthandle_status = cublasLtCreate(&ltHandle);
// Get the stream from cublas handle to reuse for biasReLU kernel. // Get the stream from cublas handle to reuse for biasReLU kernel.
cudaStream_t stream; cudaStream_t stream;
cublasGetStream(handle, &stream); cublasGetStream(handle, &stream);
...@@ -1274,28 +1271,29 @@ int mlp_fp( ...@@ -1274,28 +1271,29 @@ int mlp_fp(
// try with cublaslt first for supported case with valid handle // try with cublaslt first for supported case with valid handle
int cublaslt_status = 1; int cublaslt_status = 1;
if(lthandle_status == CUBLAS_STATUS_SUCCESS && activation < 2){ if(activation < 1){
cublaslt_status = mlp_gemm_lt( cublaslt_status = mlp_gemm_lt(
ltHandle, //ltHandle,
CUBLAS_OP_T, (cublasLtHandle_t)handle,
CUBLAS_OP_N, CUBLAS_OP_T,
ofeat, CUBLAS_OP_N,
batch_size, ofeat,
ifeat, batch_size,
&one, ifeat,
weight, &one,
ifeat, weight,
input, ifeat,
ifeat, input,
&zero, ifeat,
output, &zero,
ofeat, output,
lt_workspace, ofeat,
1 << 22, lt_workspace,
stream, 1 << 22,
use_bias == 1, stream,
activation == 1, use_bias == 1,
bias); activation == 1,
bias);
} }
// if cublaslt failed or not executed, fallback to cublas // if cublaslt failed or not executed, fallback to cublas
...@@ -1357,9 +1355,6 @@ int mlp_fp( ...@@ -1357,9 +1355,6 @@ int mlp_fp(
reserved_space_y += ofeat * batch_size; reserved_space_y += ofeat * batch_size;
} }
if(lthandle_status == CUBLAS_STATUS_SUCCESS) cublasLtDestroy(ltHandle);
return 0; return 0;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment