Enable mlp_cuda extension. (#28)

* enable mlp cuda * add setup changes and tests * skip the unit tests * updated conditions for empty array * removed hip platform conditions

Enable mlp_cuda extension. (#28)
* enable mlp cuda * add setup changes and tests * skip the unit tests * updated conditions for empty array * removed hip platform conditions
d2f6d04a · Chaitanya Sri Krishna Lolla · GitHub · 4116ed66 · d2f6d04a · d2f6d04a
Unverified Commit d2f6d04a authored Aug 05, 2020 by Chaitanya Sri Krishna Lolla Committed by GitHub Aug 05, 2020
5 changed files
--- a/csrc/mlp.cpp
+++ b/csrc/mlp.cpp
@@ -4,6 +4,14 @@
 #include <stdio.h>
+int SizeTToInt(size_t data)
+{
+    if (data > std::numeric_limits<int>::max()) {
+        throw std::runtime_error("Invalid cast.");
+    }
+    return static_cast<int>(data);
+}
 size_t get_mlp_reserved_space(int batch_size, int num_layers, const int* output_features);
 template <typename T>
@@ -62,7 +70,7 @@ std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at
  // create output/workspace tensor
  // TODO(deyuf): just get buffer?
  auto out = at::empty({batch_size, output_features.back()}, inputs[0].type());
-  auto reserved_space = at::empty({reserved_size}, inputs[0].type());
+  auto reserved_space = at::empty({SizeTToInt(reserved_size)}, inputs[0].type());
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
    std::vector<scalar_t*> w_ptr;
@@ -134,7 +142,7 @@ std::vector<at::Tensor> mlp_backward(
        get_mlp_bp_workspace_in_bytes<scalar_t>(batch_size, num_layers, output_features.data());
    // auto work_space = at::empty({work_size*4}, at::kByte);
-    auto work_space = at::empty({work_size / sizeof(scalar_t)}, inputs[0].type());
+    auto work_space = at::empty({SizeTToInt(work_size / sizeof(scalar_t))}, inputs[0].type());
    auto result = mlp_bp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(),

--- a/csrc/mlp_cuda.cu
+++ b/csrc/mlp_cuda.cu
@@ -67,6 +67,33 @@ cublasStatus_t mlp_gemm(
    const float* beta,
    double* C,
    int ldc) {
+#ifdef __HIP_PLATFORM_HCC__
+  return rocblas_gemm_ex(
+      handle,
+      transa,
+      transb,
+      m,
+      n,
+      k,
+      alpha,
+      A,
+      rocblas_datatype_f64_r,
+      lda,
+      B,
+      rocblas_datatype_f64_r,
+      ldb,
+      beta,
+      C,
+      rocblas_datatype_f64_r,
+      ldc,
+      C,
+      rocblas_datatype_f64_r,
+      ldc,
+      rocblas_datatype_f64_r,
+      rocblas_gemm_algo_standard,
+      0,
+      0);  
+#else
  return cublasGemmEx(
      handle,
      transa,
@@ -87,6 +114,7 @@ cublasStatus_t mlp_gemm(
      ldc,
      CUDA_R_64F,
      CUBLAS_GEMM_DEFAULT);
+#endif
 }
 // FP32 Wrapper around cublas GEMMEx
@@ -105,6 +133,34 @@ cublasStatus_t mlp_gemm(
    const float* beta,
    float* C,
    int ldc) {
+#ifdef __HIP_PLATFORM_HCC__
+  return rocblas_gemm_ex(
+      handle,
+      transa,
+      transb,
+      m,
+      n,
+      k,
+      alpha,
+      A,
+      rocblas_datatype_f32_r,
+      lda,
+      B,
+      rocblas_datatype_f32_r,
+      ldb,
+      beta,
+      C,
+      rocblas_datatype_f32_r,
+      ldc,
+      C,
+      rocblas_datatype_f32_r,
+      ldc,
+      rocblas_datatype_f32_r,
+      rocblas_gemm_algo_standard,
+      0,
+      0);
+#else
  return cublasGemmEx(
      handle,
      transa,
@@ -125,6 +181,7 @@ cublasStatus_t mlp_gemm(
      ldc,
      CUDA_R_32F,
      CUBLAS_GEMM_DEFAULT);
+#endif
 }
 // FP16 Tensor core wrapper around cublas GEMMEx
@@ -143,6 +200,33 @@ cublasStatus_t mlp_gemm(
    float* beta,
    at::Half* C,
    int ldc) {
+#ifdef __HIP_PLATFORM_HCC__
+  return rocblas_gemm_ex(
+      handle,
+      transa,
+      transb,
+      m,
+      n,
+      k,
+      alpha,
+      A,
+      rocblas_datatype_f16_r,
+      lda,
+      B,
+      rocblas_datatype_f16_r,
+      ldb,
+      beta,
+      C,
+      rocblas_datatype_f16_r,
+      ldc,
+      C,
+      rocblas_datatype_f16_r,
+      ldc,
+      rocblas_datatype_f32_r,
+      rocblas_gemm_algo_standard,
+      0,
+      0);
+#else
  return cublasGemmEx(
      handle,
      transa,
@@ -163,6 +247,7 @@ cublasStatus_t mlp_gemm(
      ldc,
      CUDA_R_32F,
      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+#endif
 }
 // Bias ADD. Assume input X is [features x batch size], column major.

--- a/setup.py
+++ b/setup.py
@@ -223,7 +223,13 @@ if "--cuda_ext" in sys.argv:
                              extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                                  'nvcc':['-O3'] + version_dependent_macros}))
        else:
-            print ("INFO: Skipping MLP extension")
+            print ("INFO: Building MLP extension")
+            ext_modules.append(
+                CUDAExtension(name='mlp_cuda',
+                              sources=['csrc/mlp.cpp',
+                                       'csrc/hip/mlp_hip.hip'],
+                              extra_compile_args={'cxx' : ['-O3'] + version_dependent_macros,
+                                                  'nvcc' : []}))
 if "--bnp" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension

--- a/tests/L0/run_mlp/test_mlp.py
+++ b/tests/L0/run_mlp/test_mlp.py
@@ -7,6 +7,7 @@ import torch
 from torch import nn
 from apex.mlp import MLP
+from apex.testing.common_utils import skipIfRocm
 batch_size = 1024
 mlp_sizes = [480, 1024, 1024, 512, 256, 1]
@@ -17,6 +18,7 @@ class TestMLP(unittest.TestCase):
    def test_creation(self):
        MLP(mlp_sizes)
+    @skipIfRocm
    def test_numeric(self):
        mlp = MLP(mlp_sizes).cuda()
@@ -51,6 +53,7 @@ class TestMLP(unittest.TestCase):
            ref_mlp[0].bias.grad.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)
+    @skipIfRocm
    def test_no_bias(self):
        for use_activation in ['none', 'relu', 'sigmoid']:
            mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()
@@ -88,6 +91,7 @@ class TestMLP(unittest.TestCase):
                ref_mlp[0].weight.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=100)
+    @skipIfRocm
    def test_with_bias(self):
        for use_activation in ['none', 'relu', 'sigmoid']:
            mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()
@@ -130,6 +134,7 @@ class TestMLP(unittest.TestCase):
                ref_mlp[0].bias.grad.detach().cpu().numpy(),
                atol=1e-7, rtol=1e-5)
+    @skipIfRocm
    def test_no_grad(self):
        mlp = MLP(mlp_sizes).cuda()
@@ -160,7 +165,7 @@ class TestMLP(unittest.TestCase):
            ref_mlp[0].weight.grad.detach().cpu().numpy(),
            atol=1e-7, rtol=1e-5)
+    @skipIfRocm
    def test_performance_half(self):
        mlp = MLP(mlp_sizes).cuda().half()

--- a/tests/L0/run_test.py
+++ b/tests/L0/run_test.py
@@ -9,7 +9,6 @@ ROCM_BLACKLIST = [
    'run_fused_layer_norm',
    'run_pyprof_nvtx',
    'run_pyprof_data',
-    'run_mlp'
 ]
 runner = unittest.TextTestRunner(verbosity=2)