refactor kernel (#142)

f68eddfb · ver217 · GitHub · 4a3d3446 · f68eddfb · f68eddfb
Unverified Commit f68eddfb authored Jan 13, 2022 by ver217 Committed by GitHub Jan 13, 2022
20 changed files
--- a/MANIFEST.in
+++ b/MANIFEST.in
 include *.txt README.md
 recursive-include requirements *.txt
 recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc
-recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
\ No newline at end of file
\ No newline at end of file
--- a/colossalai/kernel/__init__.py
+++ b/colossalai/kernel/__init__.py
-from .jit.bias_dropout_add import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
-from .jit.bias_gelu import bias_gelu_impl
 from .cuda_native import LayerNorm, FusedScaleMaskSoftmax, MultiHeadAttention
 __all__ = [
-    "bias_dropout_add_fused_train", "bias_dropout_add_fused_inference", "bias_gelu_impl",
    "LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"
 ]
--- a/colossalai/kernel/cuda_native/__init__.py
+++ b/colossalai/kernel/cuda_native/__init__.py
-from .builder import _build_cuda_native_kernel
-CUDA_NATIVE_KERNEL_BUILD = False
-def build_cuda_native_kernel():
-    global CUDA_NATIVE_KERNEL_BUILD
-    if CUDA_NATIVE_KERNEL_BUILD == False:
-        _build_cuda_native_kernel()
-        CUDA_NATIVE_KERNEL_BUILD = True
-build_cuda_native_kernel()
 from .layer_norm import MixedFusedLayerNorm as LayerNorm
 from .scaled_softmax import FusedScaleMaskSoftmax
 from .multihead_attention import MultiHeadAttention
\ No newline at end of file
--- a/colossalai/kernel/cuda_native/builder.py
+++ b/colossalai/kernel/cuda_native/builder.py
-import os
-import pathlib
-import subprocess
-from torch.utils import cpp_extension
-# Setting this param to a list has a problem of generating different
-# compilation commands (with diferent order of architectures) and
-# leading to recompilation of fused kernels. Set it to empty string
-# to avoid recompilation and assign arch flags explicity in
-# extra_cuda_cflags below
-os.environ["TORCH_CUDA_ARCH_LIST"] = ""
-def _build_cuda_native_kernel():
-    # Check if cuda 11 is installed for compute capability 8.0
-    cc_flag = []
-    _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
-    if int(bare_metal_major) >= 11:
-        cc_flag.append('-gencode')
-        cc_flag.append('arch=compute_80,code=sm_80')
-    # Build path
-    basepath = pathlib.Path(__file__).parent.absolute()
-    srcpath = basepath / 'csrc'
-    buildpath = basepath / 'build'
-    _create_build_dir(buildpath)
-    # Helper function to build the kernels.
-    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
-        return cpp_extension.load(
-            name=name,
-            sources=sources,
-            build_directory=buildpath,
-            extra_cflags=[
-                '-O3',
-            ],
-            extra_include_paths=[str(srcpath / 'kernels' / 'include')],
-            extra_cuda_cflags=['-O3', '-gencode', 'arch=compute_70,code=sm_70', '--use_fast_math'] +
-            extra_cuda_flags + cc_flag,
-            verbose=False)
-    # ==============
-    # Fused softmax.
-    # ==============
-    extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
-                        '-U__CUDA_NO_HALF_CONVERSIONS__',
-                        '--expt-relaxed-constexpr',
-                        '--expt-extended-lambda']
-    # Upper triangular softmax.
-    sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
-                srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
-    colossal_scaled_upper_triang_masked_softmax = _cpp_extention_load_helper(
-        "colossal_scaled_upper_triang_masked_softmax",
-        sources, extra_cuda_flags)
-    # Masked softmax.
-    sources=[srcpath / 'scaled_masked_softmax.cpp',
-                srcpath / 'scaled_masked_softmax_cuda.cu']
-    colossal_scaled_masked_softmax = _cpp_extention_load_helper(
-        "colossal_scaled_masked_softmax", sources, extra_cuda_flags)
-    # =================================
-    # Mixed precision fused layer norm.
-    # =================================
-    extra_cuda_flags = ['-maxrregcount=50']
-    sources = [srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu']
-    colossal_layer_norm_cuda = _cpp_extention_load_helper("colossal_layer_norm_cuda", sources,
-                                                          extra_cuda_flags)
-    # ==========================================
-    # Mixed precision Transformer Encoder Layer.
-    # ==========================================
-    extra_cuda_flags = ['-std=c++14',
-                        '-U__CUDA_NO_HALF_OPERATORS__',
-                        '-U__CUDA_NO_HALF_CONVERSIONS__',
-                        '-U__CUDA_NO_HALF2_OPERATORS__',
-                        '-DTHRUST_IGNORE_CUB_VERSION_CHECK']
-    sources = [srcpath / 'multihead_attention_1d.cpp']
-    kernel_sources = ["cublas_wrappers.cu",
-                      "transform_kernels.cu",
-                      "dropout_kernels.cu",
-                      "normalize_kernels.cu",
-                      "softmax_kernels.cu",
-                      "general_kernels.cu",
-                      "cuda_util.cu"]
-    sources += [(srcpath / 'kernels' / cu_file) for cu_file in kernel_sources]
-    colossal_multihead_attention = _cpp_extention_load_helper("colossal_multihead_attention", sources,
-                                                          extra_cuda_flags)
-def _get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
-    return raw_output, bare_metal_major, bare_metal_minor
-def _create_build_dir(buildpath):
-    try:
-        os.mkdir(buildpath)
-    except OSError:
-        if not os.path.isdir(buildpath):
-            print(f"Creation of the build directory {buildpath} failed")
--- a/csrc/colossal_C_frontend.cpp
+++ b/csrc/colossal_C_frontend.cpp
--- a/colossalai/kernel/cuda_native/csrc/compat.h
+++ b/colossalai/kernel/cuda_native/csrc/compat.h
-/*This code from NVIDIA apex:
+// modified from https://github.com/NVIDIA/apex/blob/master/csrc/compat.h
- *     https://github.com/NVIDIA/apex
- *     with minor changes. */
 #ifndef TORCH_CHECK
 #define TORCH_CHECK AT_CHECK
 #endif

--- a/csrc/multi_tensor_adam.cu
+++ b/csrc/multi_tensor_adam.cu
--- a/csrc/multi_tensor_apply.cuh
+++ b/csrc/multi_tensor_apply.cuh
--- a/csrc/multi_tensor_l2norm_kernel.cu
+++ b/csrc/multi_tensor_l2norm_kernel.cu
--- a/csrc/multi_tensor_lamb.cu
+++ b/csrc/multi_tensor_lamb.cu
--- a/csrc/multi_tensor_scale_kernel.cu
+++ b/csrc/multi_tensor_scale_kernel.cu
--- a/csrc/multi_tensor_sgd_kernel.cu
+++ b/csrc/multi_tensor_sgd_kernel.cu
--- a/colossalai/kernel/cuda_native/csrc/type_shim.h
+++ b/colossalai/kernel/cuda_native/csrc/type_shim.h
@@ -71,3 +71,202 @@
    default:								\
      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");	\
    }
+// Forward/backward compatiblity hack around
+// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
+// pending more future-proof guidance from upstream.
+// struct TypeShim
+// {
+//   const at::Type& payload;
+//   TypeShim(const at::Type& type) : payload(type) {}
+//   // Enable trivial conversion to a const at::Type& for pre-3aeb78
+//   operator const at::Type&(){ return payload; };
+//   // Enable dispatch switch statements to take *this directly for  post-3aeb78
+//   //operator at::ScalarType(){ return payload.; };
+// };
+#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)                 \
+    switch (TYPE)                                                       \
+    {                                                                   \
+    case at::ScalarType::Float:                                         \
+    {                                                                   \
+        using scalar_t_##LEVEL = float;                                 \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    case at::ScalarType::Half:                                          \
+    {                                                                   \
+        using scalar_t_##LEVEL = at::Half;                              \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    default:                                                            \
+        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+    }
+#define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...)            \
+    switch (TYPE)                                                       \
+    {                                                                   \
+    case at::ScalarType::Float:                                         \
+    {                                                                   \
+        using scalar_t_##LEVEL = float;                                 \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    case at::ScalarType::Half:                                          \
+    {                                                                   \
+        using scalar_t_##LEVEL = at::Half;                              \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    case at::ScalarType::Byte:                                          \
+    {                                                                   \
+        using scalar_t_##LEVEL = uint8_t;                               \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    default:                                                            \
+        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+    }
+#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)          \
+    switch (TYPE)                                                       \
+    {                                                                   \
+    case at::ScalarType::Double:                                        \
+    {                                                                   \
+        using scalar_t_##LEVEL = double;                                \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    case at::ScalarType::Float:                                         \
+    {                                                                   \
+        using scalar_t_##LEVEL = float;                                 \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    case at::ScalarType::Half:                                          \
+    {                                                                   \
+        using scalar_t_##LEVEL = at::Half;                              \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    default:                                                            \
+        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+    }
+#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...)               \
+    switch (TYPE)                                                       \
+    {                                                                   \
+    case at::ScalarType::Double:                                        \
+    {                                                                   \
+        using scalar_t_##LEVEL = double;                                \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    case at::ScalarType::Float:                                         \
+    {                                                                   \
+        using scalar_t_##LEVEL = float;                                 \
+        __VA_ARGS__;                                                    \
+        break;                                                          \
+    }                                                                   \
+    default:                                                            \
+        AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+    }
+template <typename T>
+__device__ __forceinline__ T reduce_block_into_lanes(T *x,
+                                                     T val,
+                                                     int lanes = 1,
+                                                     bool share_result = false) // lanes is intended to be <= 32.
+{
+    int tid = threadIdx.x + threadIdx.y * blockDim.x;
+    int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
+    if (blockSize >= 64)
+    {
+        x[tid] = val;
+        __syncthreads();
+    }
+#pragma unroll
+    for (int i = (blockSize >> 1); i >= 64; i >>= 1)
+    {
+        if (tid < i)
+            x[tid] = x[tid] + x[tid + i];
+        __syncthreads();
+    }
+    T final;
+    if (tid < 32)
+    {
+        if (blockSize >= 64)
+            final = x[tid] + x[tid + 32];
+        else
+            final = val;
+            // __SYNCWARP();
+#pragma unroll
+        for (int i = 16; i >= lanes; i >>= 1)
+            final = final + __shfl_down_sync(0xffffffff, final, i);
+    }
+    if (share_result)
+    {
+        if (tid < lanes)
+            x[tid] = final; // EpilogueOp
+        // Make sure the smem result is visible to all warps.
+        __syncthreads();
+    }
+    return final;
+}
+template <typename T>
+__device__ __forceinline__ T reduce_block_into_lanes_max_op(T *x,
+                                                            T val,
+                                                            int lanes = 1,
+                                                            bool share_result = false) // lanes is intended to be <= 32.
+{
+    int tid = threadIdx.x + threadIdx.y * blockDim.x;
+    int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
+    if (blockSize >= 64)
+    {
+        x[tid] = val;
+        __syncthreads();
+    }
+#pragma unroll
+    for (int i = (blockSize >> 1); i >= 64; i >>= 1)
+    {
+        if (tid < i)
+            x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid + i]));
+        __syncthreads();
+    }
+    T final;
+    if (tid < 32)
+    {
+        if (blockSize >= 64)
+            final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32]));
+        else
+            final = val;
+            // __SYNCWARP();
+#pragma unroll
+        for (int i = 16; i >= lanes; i >>= 1)
+            final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
+    }
+    if (share_result)
+    {
+        if (tid < lanes)
+            x[tid] = final; // EpilogueOp
+        // Make sure the smem result is visible to all warps.
+        __syncthreads();
+    }
+    return final;
+}
\ No newline at end of file
--- a/colossalai/kernel/cuda_native/layer_norm.py
+++ b/colossalai/kernel/cuda_native/layer_norm.py
@@ -34,10 +34,10 @@ class FusedLayerNormAffineFunction(torch.autograd.Function):
        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        grad_input, grad_weight, grad_bias \
-          = colossal_layer_norm_cuda.backward_affine(
+            = colossal_layer_norm_cuda.backward_affine(
-            grad_output.contiguous(), mean, invvar,
+                grad_output.contiguous(), mean, invvar,
-            input_, ctx.normalized_shape,
+                input_, ctx.normalized_shape,
-            weight_, bias_, ctx.eps)
+                weight_, bias_, ctx.eps)
        return grad_input, grad_weight, grad_bias, None, None
@@ -48,7 +48,11 @@ class MixedFusedLayerNorm(torch.nn.Module):
        super(MixedFusedLayerNorm, self).__init__()
        global colossal_layer_norm_cuda
-        colossal_layer_norm_cuda = importlib.import_module("colossal_layer_norm_cuda")
+        if colossal_layer_norm_cuda is None:
+            try:
+                colossal_layer_norm_cuda = importlib.import_module("colossal_layer_norm_cuda")
+            except ImportError:
+                raise RuntimeError('MixedFusedLayerNorm requires cuda extensions')
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)

--- a/colossalai/kernel/cuda_native/multihead_attention.py
+++ b/colossalai/kernel/cuda_native/multihead_attention.py
@@ -34,6 +34,7 @@ def calc_offset(sizes):
 colossal_multihead_attention = None
 @dataclass
 class Config:
    max_batch_tokens: int  # max batch token numbers
@@ -94,7 +95,7 @@ class MultiHeadAttention1DFunc(Function):
            input_mask = input_mask.to(torch.half)
        grad_input, grad_in_proj_weight, grad_in_proj_bias, grad_out_proj_weight, \
            grad_out_proj_bias, grad_norm_weight, grad_norm_bias = backward_func(
-            ctx.config.layer_id, grad_output, output, input, input_mask, in_proj_weight, \
+                ctx.config.layer_id, grad_output, output, input, input_mask, in_proj_weight,
                in_proj_bias, out_proj_weight, out_proj_bias, norm_weight, norm_bias)
        return (grad_input, None, grad_in_proj_weight, grad_in_proj_bias, grad_out_proj_weight,
@@ -142,7 +143,10 @@ class MultiHeadAttention(nn.Module):
        # Load cuda modules if needed
        global colossal_multihead_attention
        if colossal_multihead_attention is None:
-            colossal_multihead_attention = importlib.import_module("colossal_multihead_attention")
+            try:
+                colossal_multihead_attention = importlib.import_module("colossal_multihead_attention")
+            except ImportError:
+                raise RuntimeError('MultiHeadAttention requires cuda extensions')
        # create the layer in cuda kernels.
        cuda_module = colossal_multihead_attention
@@ -210,14 +214,14 @@ class MultiHeadAttention(nn.Module):
            with torch.no_grad():
                self.in_proj_weight.copy_(
                    attn_qkvw_global.view(3, hs, hs)[:,
-                                                    int(hs * rank_in_pg /
+                                                     int(hs * rank_in_pg /
-                                                        self.pg_size):int(hs * (rank_in_pg + 1) /
+                                                         self.pg_size):int(hs * (rank_in_pg + 1) /
-                                                                        self.pg_size), :])
+                                                                           self.pg_size), :])
                self.in_proj_bias.copy_(
                    attn_qkvb_global.view(3, hs)[:,
-                                                int(hs * rank_in_pg /
+                                                 int(hs * rank_in_pg /
-                                                    self.pg_size):int(hs * (rank_in_pg + 1) /
+                                                     self.pg_size):int(hs * (rank_in_pg + 1) /
-                                                                    self.pg_size)])
+                                                                       self.pg_size)])
            attn_ow_global = torch.empty(hs, hs)
            nn.init.xavier_uniform_(attn_ow_global, 1.0)
@@ -226,9 +230,9 @@ class MultiHeadAttention(nn.Module):
            attn_ow_global = attn_ow_global.cpu()
            with torch.no_grad():
                self.out_proj_weight.copy_(attn_ow_global[:,
-                                                      int(hs * rank_in_pg /
+                                                          int(hs * rank_in_pg /
-                                                          self.pg_size):int(hs * (rank_in_pg + 1) /
+                                                              self.pg_size):int(hs * (rank_in_pg + 1) /
-                                                                            self.pg_size)])
+                                                                                self.pg_size)])
        else:
            attn_qkvw = self.in_proj_weight.view(-1, hs)

--- a/colossalai/kernel/cuda_native/scaled_softmax.py
+++ b/colossalai/kernel/cuda_native/scaled_softmax.py
@@ -21,7 +21,10 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, scale):
-        import colossal_scaled_upper_triang_masked_softmax
+        try:
+            import colossal_scaled_upper_triang_masked_softmax
+        except ImportError:
+            raise RuntimeError('ScaledUpperTriangMaskedSoftmax requires cuda extensions')
        scale_t = torch.tensor([scale])
        softmax_results = colossal_scaled_upper_triang_masked_softmax.forward(
@@ -33,7 +36,10 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    @staticmethod
    def backward(ctx, output_grads):
-        import colossal_scaled_upper_triang_masked_softmax
+        try:
+            import colossal_scaled_upper_triang_masked_softmax
+        except ImportError:
+            raise RuntimeError('ScaledUpperTriangMaskedSoftmax requires cuda extensions')
        softmax_results, scale_t = ctx.saved_tensors
        input_grads = colossal_scaled_upper_triang_masked_softmax.backward(
@@ -53,7 +59,10 @@ class ScaledMaskedSoftmax(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, mask, scale):
-        import colossal_scaled_masked_softmax
+        try:
+            import colossal_scaled_masked_softmax
+        except ImportError:
+            raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
        scale_t = torch.tensor([scale])
@@ -63,7 +72,10 @@ class ScaledMaskedSoftmax(torch.autograd.Function):
    @staticmethod
    def backward(ctx, output_grads):
-        import colossal_scaled_masked_softmax
+        try:
+            import colossal_scaled_masked_softmax
+        except ImportError:
+            raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
        softmax_results, scale_t = ctx.saved_tensors
@@ -179,6 +191,9 @@ class FusedScaleMaskSoftmax(nn.Module):
    @staticmethod
    def get_batch_per_block(sq, sk, b, np):
-        import colossal_scaled_masked_softmax
+        try:
+            import colossal_scaled_masked_softmax
+        except ImportError:
+            raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
        return colossal_scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
--- a/colossalai/kernel/jit/__init__.py
+++ b/colossalai/kernel/jit/__init__.py
 from .option import _set_jit_fusion_options
+from .bias_dropout_add import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
+from .bias_gelu import bias_gelu_impl
+_set_jit_fusion_options()
-_set_jit_fusion_options()
+__all__ = [
\ No newline at end of file
+    "bias_dropout_add_fused_train", "bias_dropout_add_fused_inference", "bias_gelu_impl",
+]
--- a/colossalai/kernel/jit/option.py
+++ b/colossalai/kernel/jit/option.py
@@ -2,6 +2,7 @@ import torch
 JIT_OPTIONS_SET = False
 def _set_jit_fusion_options():
    """Set PyTorch JIT layer fusion options."""
    global JIT_OPTIONS_SET

--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -65,8 +65,7 @@ class FusedAdam(torch.optim.Optimizer):
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
            self.multi_tensor_adam = colossal_C.multi_tensor_adam
        else:
-            raise RuntimeError(
+            raise RuntimeError('FusedAdam requires cuda extensions')
-                'apex.optimizers.FusedAdam requires cuda extensions')
    def zero_grad(self):
        if self.set_grad_none:

--- a/colossalai/nn/optimizer/fused_lamb.py
+++ b/colossalai/nn/optimizer/fused_lamb.py
@@ -73,8 +73,7 @@ class FusedLAMB(torch.optim.Optimizer):
                [0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
            self.multi_tensor_lamb = colossal_C.multi_tensor_lamb
        else:
-            raise RuntimeError(
+            raise RuntimeError('FusedLAMB requires cuda extensions')
-                'apex.optimizers.FusedLAMB requires cuda extensions')
        self.adam_w_mode = 1 if adam_w_mode else 0
        self.set_grad_none = set_grad_none