Recover kernel files

7696cead · binmakeswell · Frank Lee · e83b2ce8 · 7696cead · 7696cead
Commit 7696cead authored Jul 13, 2022 by binmakeswell Committed by Frank Lee Jul 13, 2022
8 changed files
--- a/colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu
+++ b/colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu
-#include <cooperative_groups.h>
-
 #include <chrono>
 #include <ctime>

 #include "kernels.h"

+#include <cooperative_groups.h>
+
+
 namespace cg = cooperative_groups;

 curandStatePhilox4_32_10_t *curandstate;

--- a/colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/kernels.h
@@ -3,11 +3,10 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <curand_kernel.h>
+#include <stdexcept>
 #include <stdio.h>
 #include <stdlib.h>

-#include <stdexcept>
-
 #define MAX_THREADS 1024
 #define WARP_SIZE 32

@@ -133,9 +132,8 @@ __forceinline__ __host__ __device__ int flat_3dim(int id1, int id2, int id3,
 }

 /* Convert 4-dim tensor index into vector index */
-__forceinline__ __host__ __device__ int flat_4dim(int id1, int id2, int id3,
-                                                  int id4, int dim2, int dim3,
-                                                  int dim4) {
+__forceinline__ __host__ __device__ int
+flat_4dim(int id1, int id2, int id3, int id4, int dim2, int dim3, int dim4) {
  // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
  int res = id4;

@@ -203,9 +201,9 @@ __forceinline__ __host__ __device__ int flat_6dim(int id1, int id2, int id3,
 }

 /* Convert vector index to 6-dim tensor index */
-__forceinline__ __host__ __device__ void decompose_6dim(
-    int src, int dim1, int dim2, int dim3, int dim4, int dim5, int *id0,
-    int *id1, int *id2, int *id3, int *id4, int *id5) {
+__forceinline__ __host__ __device__ void
+decompose_6dim(int src, int dim1, int dim2, int dim3, int dim4, int dim5,
+               int *id0, int *id1, int *id2, int *id3, int *id4, int *id5) {
  *id5 = src % dim5;
  src /= dim5;

@@ -223,11 +221,9 @@ __forceinline__ __host__ __device__ void decompose_6dim(
 }

 /* Convert vector index to 5-dim tensor index */
-__forceinline__ __host__ __device__ void decompose_5dim(int src, int dim1,
-                                                        int dim2, int dim3,
-                                                        int dim4, int *id0,
-                                                        int *id1, int *id2,
-                                                        int *id3, int *id4) {
+__forceinline__ __host__ __device__ void
+decompose_5dim(int src, int dim1, int dim2, int dim3, int dim4, int *id0,
+               int *id1, int *id2, int *id3, int *id4) {
  *id4 = src % dim4;
  src /= dim4;

@@ -257,9 +253,8 @@ __forceinline__ __host__ __device__ void decompose_4dim(int src, int dim1,
 }

 /* Convert vector index to 3-dim tensor index */
-__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
-                                                        int dim2, int *id0,
-                                                        int *id1, int *id2) {
+__forceinline__ __host__ __device__ void
+decompose_3dim(int src, int dim1, int dim2, int *id0, int *id1, int *id2) {
  *id2 = src % dim2;
  src /= dim2;


--- a/colossalai/kernel/cuda_native/csrc/kernels/transform_kernels.cu
+++ b/colossalai/kernel/cuda_native/csrc/kernels/transform_kernels.cu
@@ -135,10 +135,9 @@ __global__ void bias_add_transform_20314(T *output, const T *input,
                                         const T *bias, int dim_3, int dim_4);

 template <>
-__global__ void bias_add_transform_20314<float>(float *output,
-                                                const float *input,
-                                                const float *bias, int dim_3,
-                                                int dim_4) {
+__global__ void
+bias_add_transform_20314<float>(float *output, const float *input,
+                                const float *bias, int dim_3, int dim_4) {
  int id0 = blockIdx.x;
  int id1 = blockIdx.y;
  int id2 = blockIdx.z;
@@ -174,10 +173,9 @@ __global__ void bias_add_transform_20314<float>(float *output,
 }

 template <>
-__global__ void bias_add_transform_20314<__half>(__half *output,
-                                                 const __half *input,
-                                                 const __half *bias, int dim_3,
-                                                 int dim_4) {
+__global__ void
+bias_add_transform_20314<__half>(__half *output, const __half *input,
+                                 const __half *bias, int dim_3, int dim_4) {
  int id0 = blockIdx.x;
  int id1 = blockIdx.y;
  int id2 = blockIdx.z;

--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
-// modified from
-// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+// modified from https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
-#include <assert.h>
 #include <c10/cuda/CUDAGuard.h>
-
 #include "compat.h"

+#include <assert.h>
+
 // #include <iostream>

 // This header is the one-stop shop for all your multi-tensor apply needs.
@@ -18,108 +17,117 @@ constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

 template <int n>
-struct TensorListMetadata {
-  void *addresses[n][depth_to_max_tensors[n - 1]];
-  int sizes[depth_to_max_tensors[n - 1]];
-  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-  int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a
-                                                   // full int.
-  int start_tensor_this_launch;
+struct TensorListMetadata
+{
+    void *addresses[n][depth_to_max_tensors[n - 1]];
+    int sizes[depth_to_max_tensors[n - 1]];
+    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
+    int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
+    int start_tensor_this_launch;
 };

 template <typename T, typename U, typename... ArgTypes>
-__global__ void multi_tensor_apply_kernel(int chunk_size,
-                                          volatile int *noop_flag, T tl,
-                                          U callable, ArgTypes... args) {
-  // Hand the chunk information to the user-supplied functor to process however
-  // it likes.
-  callable(chunk_size, noop_flag, tl, args...);
+__global__ void multi_tensor_apply_kernel(
+    int chunk_size,
+    volatile int *noop_flag,
+    T tl,
+    U callable,
+    ArgTypes... args)
+{
+    // Hand the chunk information to the user-supplied functor to process however it likes.
+    callable(chunk_size, noop_flag, tl, args...);
 }

 template <int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
-    int block_size, int chunk_size, const at::Tensor &noop_flag,
-    const std::vector<std::vector<at::Tensor>> &tensor_lists, T callable,
-    ArgTypes... args) {
-  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
-  int len0 = tensor_lists[0].size();
-  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
-  auto ref_device = tensor_lists[0][0].device();
-  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-  for (int l = 0; l < tensor_lists.size();
-       l++)  // No range-based for because I need indices
-  {
-    TORCH_CHECK(tensor_lists[l].size() == len0,
-                "Size mismatch among tensor lists");
-    for (int t = 0; t < tensor_lists[l].size(); t++) {
-      // TODO:  Print which tensor fails.
-      bool contiguous_memory = tensor_lists[l][t].is_contiguous();
+    int block_size,
+    int chunk_size,
+    const at::Tensor &noop_flag,
+    const std::vector<std::vector<at::Tensor>> &tensor_lists,
+    T callable,
+    ArgTypes... args)
+{
+    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
+    int len0 = tensor_lists[0].size();
+    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
+    auto ref_device = tensor_lists[0][0].device();
+    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
+    for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
+    {
+        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
+        for (int t = 0; t < tensor_lists[l].size(); t++)
+        {
+            // TODO:  Print which tensor fails.
+            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
 #ifdef VERSION_GE_1_5
-      contiguous_memory =
-          (contiguous_memory ||
-           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
+            contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
-      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-      TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
-                  "A tensor was not on the same device as the first tensor");
-      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
-                  "Size mismatch");
+            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
+            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
+            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
+        }
    }
-  }
-
-  int ntensors = tensor_lists[0].size();
-
-  TensorListMetadata<depth> tl;
-
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  tl.start_tensor_this_launch = 0;
-  int loc_block_info = 0;
-  int loc_tensor_info = 0;
-  for (int t = 0; t < ntensors; t++) {
-    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-    for (int d = 0; d < depth; d++)
-      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
-    loc_tensor_info++;
-
-    int chunks_this_tensor =
-        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
-
-    for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
-      // std::cout << chunks_this_tensor << std::endl;
-      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
-      tl.block_to_chunk[loc_block_info] = chunk;
-      loc_block_info++;
-
-      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
-                           chunk == chunks_this_tensor - 1);
-      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
-      bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-      if (tensors_full || blocks_full || last_chunk) {
-        // using accscalar_t = acc_type<scalar_t, true>;
-        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
-            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
-
-        AT_CUDA_CHECK(cudaGetLastError());
-
-        // Reset.  The control flow possibilities here make my brain hurt.
-        loc_block_info = 0;
-        if (chunk == chunks_this_tensor - 1) {
-          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
-          // << std::endl;
-          loc_tensor_info = 0;
-          tl.start_tensor_this_launch = t + 1;
-        } else {
-          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
-          // << std::endl;
-          tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
-          for (int d = 0; d < depth; d++)
-            tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
-          loc_tensor_info = 1;
-          tl.start_tensor_this_launch = t;
+
+    int ntensors = tensor_lists[0].size();
+
+    TensorListMetadata<depth> tl;
+
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
+    auto stream = at::cuda::getCurrentCUDAStream();
+
+    tl.start_tensor_this_launch = 0;
+    int loc_block_info = 0;
+    int loc_tensor_info = 0;
+    for (int t = 0; t < ntensors; t++)
+    {
+        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
+        for (int d = 0; d < depth; d++)
+            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+        loc_tensor_info++;
+
+        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
+
+        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
+        {
+            // std::cout << chunks_this_tensor << std::endl;
+            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+            tl.block_to_chunk[loc_block_info] = chunk;
+            loc_block_info++;
+
+            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
+                                 chunk == chunks_this_tensor - 1);
+            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
+            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
+            if (tensors_full || blocks_full || last_chunk)
+            {
+                // using accscalar_t = acc_type<scalar_t, true>;
+                multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
+                    chunk_size,
+                    noop_flag.DATA_PTR<int>(),
+                    tl,
+                    callable,
+                    args...);
+
+                AT_CUDA_CHECK(cudaGetLastError());
+
+                // Reset.  The control flow possibilities here make my brain hurt.
+                loc_block_info = 0;
+                if (chunk == chunks_this_tensor - 1)
+                {
+                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
+                    loc_tensor_info = 0;
+                    tl.start_tensor_this_launch = t + 1;
+                }
+                else
+                {
+                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
+                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
+                    for (int d = 0; d < depth; d++)
+                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
+                    loc_tensor_info = 1;
+                    tl.start_tensor_this_launch = t;
+                }
+            }
        }
-      }
    }
-  }
 }
\ No newline at end of file
--- a/colossalai/kernel/cuda_native/csrc/scaled_masked_softmax.cpp
+++ b/colossalai/kernel/cuda_native/csrc/scaled_masked_softmax.cpp
@@ -3,68 +3,82 @@

 #include <cuda_fp16.h>
 #include <torch/extension.h>
-
 #include <vector>

 namespace multihead_attn {
 namespace fused_softmax {
 namespace scaled_masked_softmax {

-torch::Tensor fwd_cuda(torch::Tensor const& input, torch::Tensor const& mask,
-                       float scale_factor);
-
-torch::Tensor bwd_cuda(torch::Tensor const& output_grads,
-                       torch::Tensor const& softmax_results,
-                       float scale_factor);
-
-int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches,
-                             int attn_heads);
-
-torch::Tensor fwd(torch::Tensor const& input, torch::Tensor const& mask,
-                  float scale_factor) {
+torch::Tensor fwd_cuda(
+    torch::Tensor const& input, 
+    torch::Tensor const& mask,
+    float scale_factor);
+
+torch::Tensor bwd_cuda(
+    torch::Tensor const& output_grads, 
+    torch::Tensor const& softmax_results,
+    float scale_factor);
+
+int get_batch_per_block_cuda(
+    int query_seq_len,
+    int key_seq_len,
+    int batches,
+    int attn_heads);
+
+torch::Tensor fwd(
+    torch::Tensor const& input,
+    torch::Tensor const& mask,
+    float scale_factor) {
  AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
  AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
-                 (input.scalar_type() == at::ScalarType::BFloat16),
-             "Only fp16 and bf16 are supported");
+	     (input.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
  AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");

  return fwd_cuda(input, mask, scale_factor);
 }

-torch::Tensor bwd(torch::Tensor const& output_grads,
-                  torch::Tensor const& softmax_results, float scale_factor) {
+torch::Tensor bwd(
+    torch::Tensor const& output_grads, 
+    torch::Tensor const& softmax_results,
+    float scale_factor) {
+
  AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
  AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");

  AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
-                 (output_grads.scalar_type() == at::ScalarType::BFloat16),
-             "Only fp16 and bf16 are supported");
+	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
  AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
-                 (softmax_results.scalar_type() == at::ScalarType::BFloat16),
-             "Only fp16 and bf16 are supported");
+	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");

  return bwd_cuda(output_grads, softmax_results, scale_factor);
 }

-int get_batch_per_block(int query_seq_len, int key_seq_len, int batches,
-                        int attn_heads) {
-  return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches,
-                                  attn_heads);
+int get_batch_per_block(
+    int query_seq_len,
+    int key_seq_len,
+    int batches,
+    int attn_heads) {
+    return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
 }

-}  // end namespace scaled_masked_softmax
-}  // end namespace fused_softmax
-}  // end namespace multihead_attn
+} // end namespace scaled_masked_softmax
+} // end namespace fused_softmax
+} // end namespace multihead_attn

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("forward", &multihead_attn::fused_softmax::scaled_masked_softmax::fwd,
-        "Self Multihead Attention scaled, time masked softmax -- Forward.");
+  m.def("forward", 
+        &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 
+	"Self Multihead Attention scaled, time masked softmax -- Forward.");

-  m.def("backward", &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
-        "Self Multihead Attention scaled, time masked softmax -- Backward.");
+  m.def("backward",
+        &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
+	"Self Multihead Attention scaled, time masked softmax -- Backward.");

  m.def("get_batch_per_block",
-        &multihead_attn::fused_softmax::scaled_masked_softmax::
-            get_batch_per_block,
-        "Return Batch per block size.");
+        &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
+        "Return Batch per block size."
+  );
 }
--- a/colossalai/kernel/cuda_native/csrc/scaled_masked_softmax.h
+++ b/colossalai/kernel/cuda_native/csrc/scaled_masked_softmax.h
--- a/colossalai/kernel/cuda_native/csrc/scaled_upper_triang_masked_softmax.h
+++ b/colossalai/kernel/cuda_native/csrc/scaled_upper_triang_masked_softmax.h
--- a/colossalai/kernel/cuda_native/csrc/type_shim.h
+++ b/colossalai/kernel/cuda_native/csrc/type_shim.h