Commit 8f7de847 authored by yuguo960516yuguo

dtk

parent f262efc9
Pipeline #248 failed with stages in 0 seconds
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/slice_util.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/ep/include/primitive/permute.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {

namespace {

inline hipblasOperation_t GetCublasOp(char op) {
  switch (op) {
    case 'n':
    case 'N': {
      return HIPBLAS_OP_N;
    }
    case 't':
    case 'T': {
      return HIPBLAS_OP_T;
    }
    case 'c':
    case 'C': {
      return HIPBLAS_OP_C;
    }
    default: {
      UNIMPLEMENTED();
    }
  }
  return HIPBLAS_OP_N;
}
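// Maps the element type to the corresponding hipBLAS data type enum.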
template<typename T>
struct CudaDataTypeTrait;

template<>
struct CudaDataTypeTrait<float> {
  const static hipblasDatatype_t value = HIPBLAS_R_32F;
};

template<>
struct CudaDataTypeTrait<half> {
  const static hipblasDatatype_t value = HIPBLAS_R_16F;
};

template<typename T>
void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n,
                     int64_t k, T alpha, const T* a, int64_t lda, int64_t stridea, const T* b,
                     int64_t ldb, int64_t strideb, T beta, T* c, int64_t ldc, int64_t stridec,
                     int64_t batch_size) {
  hipblasOperation_t opa = GetCublasOp(transa);
  hipblasOperation_t opb = GetCublasOp(transb);
  hipblasDatatype_t data_type = CudaDataTypeTrait<T>::value;
  OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx(
      handle, opa, opb, m, n, k, reinterpret_cast<const void*>(&alpha),
      reinterpret_cast<const void*>(a), data_type, lda, stridea, reinterpret_cast<const void*>(b),
      data_type, ldb, strideb, reinterpret_cast<const void*>(&beta), reinterpret_cast<void*>(c),
      data_type, ldc, stridec, batch_size, data_type, HIPBLAS_GEMM_DEFAULT));
}
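// half specialization: inputs and outputs stay fp16, but the compute type is float (comp_t) for
// better numerical accuracy.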
template<>
void CublasBatchGemm<half>(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n,
                           int64_t k, half alpha, const half* a, int64_t lda, int64_t stridea,
                           const half* b, int64_t ldb, int64_t strideb, half beta, half* c,
                           int64_t ldc, int64_t stridec, int64_t batch_size) {
  using comp_t = float;
  hipblasOperation_t opa = GetCublasOp(transa);
  hipblasOperation_t opb = GetCublasOp(transb);
  float alpha_f = static_cast<comp_t>(alpha);
  float beta_f = static_cast<comp_t>(beta);
  hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT;
  hipblasDatatype_t data_type = CudaDataTypeTrait<half>::value;
  hipblasDatatype_t comp_type = CudaDataTypeTrait<comp_t>::value;
  OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx(
      handle, opa, opb, m, n, k, &alpha_f, reinterpret_cast<const void*>(a), data_type, lda,
      stridea, reinterpret_cast<const void*>(b), data_type, ldb, strideb, &beta_f,
      reinterpret_cast<void*>(c), data_type, ldc, stridec, batch_size, comp_type, algo));
}

template<>
void CublasBatchGemm<float16>(hipblasHandle_t handle, char transa, char transb, int64_t m,
                              int64_t n, int64_t k, float16 alpha, const float16* a, int64_t lda,
                              int64_t stridea, const float16* b, int64_t ldb, int64_t strideb,
                              float16 beta, float16* c, int64_t ldc, int64_t stridec,
                              int64_t batch_size) {
  CublasBatchGemm<half>(handle, transa, transb, m, n, k, static_cast<half>(alpha),
                        reinterpret_cast<const half*>(a), lda, stridea,
                        reinterpret_cast<const half*>(b), ldb, strideb, static_cast<half>(beta),
                        reinterpret_cast<half*>(c), ldc, stridec, batch_size);
}
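// Thin row-major wrapper around CublasBatchGemm; alpha/beta arrive as float and are cast to the
// element type T.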
template<typename T>
void BatchedGemm(ep::Stream* stream, char opa, char opb, int64_t m, int64_t n, int64_t k,
                 float alpha, const T* a, int64_t lda, int64_t stridea, const T* b, int64_t ldb,
                 int64_t strideb, float beta, T* c, int64_t ldc, int64_t stridec,
                 int64_t batch_size) {
  // swap m and n, a and b to convert from row-major to col-major
  CublasBatchGemm<T>(stream->As<ep::CudaStream>()->cublas_handle(), opb, opa, n, m, k,
                     static_cast<T>(alpha), b, ldb, strideb, a, lda, stridea, static_cast<T>(beta),
                     c, ldc, stridec, batch_size);
}
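// hidden_states is laid out as (seq_len, batch_size, num_heads, 3 * head_size) with q, k, v packed
// along the last dimension, so the value block starts at offset 2 * head_size.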
SliceParams ConstructSliceParams4Value(int64_t seq_len, int64_t batch_size, int64_t num_heads,
                                       int64_t head_size) {
  // slice (s, b, n, 3, h) to (s, b, n, 1, h)
  SliceParams params;
  params.ndim = 4;
  params.dims[0] = seq_len;
  params.dims[1] = batch_size;
  params.dims[2] = num_heads;
  params.dims[3] = 3 * head_size;
  params.start[0] = 0;
  params.start[1] = 0;
  params.start[2] = 0;
  params.start[3] = 2 * head_size;
  params.step[0] = 1;
  params.step[1] = 1;
  params.step[2] = 1;
  params.step[3] = 1;
  params.size[0] = seq_len;
  params.size[1] = batch_size;
  params.size[2] = num_heads;
  params.size[3] = head_size;
  return params;
}

template<typename T>
void TransposeGpu(ep::Stream* stream, DataType data_type, const ShapeView& in_shape,
                  const ShapeView& out_shape, const std::vector<int32_t>& perm, const T* in,
                  T* out) {
  CHECK_EQ(in_shape.NumAxes(), out_shape.NumAxes());
  int32_t num_axes = in_shape.NumAxes();
  CHECK_EQ(num_axes, perm.size());
  for (int i = 0; i < perm.size(); ++i) { CHECK_EQ(in_shape.At(perm[i]), out_shape.At(i)); }
  auto transpose = ep::primitive::NewPrimitive<ep::primitive::PermuteFactory>(
      stream->device_type(), in_shape.NumAxes());
  CHECK(transpose);
  transpose->Launch(stream, data_type, in_shape.NumAxes(), in_shape.ptr(), in, perm.data(), out);
}
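// Forward kernel: computes alpha * q * k^T into query_mul_key with shape (b, n, sq, sk), then
// slices v out of hidden_states and transposes it from (s, b, n, h) to (b, n, s, h) into value.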
template<typename T>
class FusedSelfAttentionQueryMulKeyAndValueGpuKernel final : public user_op::OpKernel {
 public:
  FusedSelfAttentionQueryMulKeyAndValueGpuKernel() = default;
  ~FusedSelfAttentionQueryMulKeyAndValueGpuKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0);
    int64_t seq_len = h_tensor->shape_view().At(0);
    int64_t batch_size = h_tensor->shape_view().At(1);
    int64_t hidden_size = h_tensor->shape_view().At(2);
    int64_t head_size = ctx->Attr<int64_t>("head_size");
    int64_t num_heads = hidden_size / (3 * head_size);
    int64_t ld = batch_size * hidden_size;
    int64_t stride = 3 * head_size;
    int64_t k_offset = head_size;

    // q * k: (sq, b, n, h) x (sk, b, n, h) => (b, n, sq, h) x (b, n, sk, h)
    // => (b, n, sq, h) x (b, n, h, sk) -> (b, n, sq, sk)
    float alpha = ctx->Attr<float>("alpha");
    user_op::Tensor* qmk_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key", 0);
    const T* q_dptr = h_tensor->dptr<T>();
    const T* k_dptr = h_tensor->dptr<T>() + k_offset;
    BatchedGemm<T>(ctx->stream(), 'N', 'T', seq_len, seq_len, head_size, alpha, q_dptr, ld, stride,
                   k_dptr, ld, stride, 0.0f, qmk_tensor->mut_dptr<T>(), seq_len, seq_len * seq_len,
                   batch_size * num_heads);

    // slice v
    user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    user_op::Tensor* v_tensor = ctx->Tensor4ArgNameAndIndex("value", 0);
    SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size);
    SliceKernelUtil<DeviceType::kCUDA, T>::Forward(ctx->stream(), params, h_tensor->dptr<T>(),
                                                   tmp_v_tensor->mut_dptr<T>());
    // v from (s, b, n, h) transpose to (b, n, s, h)
    Shape value_shape({seq_len, batch_size, num_heads, head_size});
    TransposeGpu<T>(ctx->stream(), h_tensor->data_type(), value_shape, v_tensor->shape_view(),
                    {1, 2, 0, 3}, tmp_v_tensor->dptr<T>(), v_tensor->mut_dptr<T>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
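// Backward kernel: transposes value_grad from (b, n, s, h) back to (s, b, n, h), scatters it into
// the v slice of hidden_states_grad, then fills the q and k slices via grad_q = grad_qmk * k and
// grad_k = grad_qmk^T * q.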
template<typename T>
class FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel final : public user_op::OpKernel {
 public:
  FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() = default;
  ~FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* v_grad_tensor = ctx->Tensor4ArgNameAndIndex("value_grad", 0);
    const user_op::Tensor* qmk_grad_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key_grad", 0);
    const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0);
    user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    user_op::Tensor* h_grad_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states_grad", 0);
    float alpha = ctx->Attr<float>("alpha");
    int64_t seq_len = h_grad_tensor->shape_view().At(0);
    int64_t batch_size = h_grad_tensor->shape_view().At(1);
    int64_t hidden_size = h_grad_tensor->shape_view().At(2);
    int64_t num_heads = v_grad_tensor->shape_view().At(1);
    int64_t head_size = v_grad_tensor->shape_view().At(3);
    int64_t ld = batch_size * hidden_size;
    int64_t stride = 3 * head_size;
    CHECK_EQ(hidden_size, num_heads * stride);

    // transpose from (b, n, s, h) to (s, b, n, h)
    Shape value_shape({seq_len, batch_size, num_heads, head_size});
    TransposeGpu<T>(ctx->stream(), v_grad_tensor->data_type(), v_grad_tensor->shape_view(),
                    value_shape, {2, 0, 1, 3}, v_grad_tensor->dptr<T>(),
                    tmp_v_tensor->mut_dptr<T>());
    // slice v grad
    SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size);
    SliceKernelUtil<DeviceType::kCUDA, T>::Backward(ctx->stream(), params, tmp_v_tensor->dptr<T>(),
                                                    h_grad_tensor->mut_dptr<T>());

    // grad_q = grad_qmk * k
    // (b, n, sq, sk) x (b, n, sk, h) -> (b, n, s, h) <= (s, b, n, h) <= (s, b, n, 3, h)
    const T* qmk_grad_dptr = qmk_grad_tensor->dptr<T>();
    const T* k_dptr = h_tensor->dptr<T>() + head_size;
    T* grad_q_dptr = h_grad_tensor->mut_dptr<T>();
    BatchedGemm<T>(ctx->stream(), 'N', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr,
                   seq_len, seq_len * seq_len, k_dptr, ld, stride, 0.0f, grad_q_dptr, ld, stride,
                   batch_size * num_heads);
    // grad_k = grad_qmk * q
    // (b, n, sk, sq) x (b, n, sq, h) -> (b, n, sk, h) <= (s, b, n, h) <= (s, b, n, 3, h)
    const T* q_dptr = h_tensor->dptr<T>();
    T* grad_k_dptr = h_grad_tensor->mut_dptr<T>() + head_size;
    BatchedGemm<T>(ctx->stream(), 'T', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr,
                   seq_len, seq_len * seq_len, q_dptr, ld, stride, 0.0f, grad_k_dptr, ld, stride,
                   batch_size * num_heads);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
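// The tmp_buffer used by both kernels holds one full copy of the value tensor (the intermediate
// between the slice and the transpose).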
size_t InferTmpBufferSize(user_op::InferContext* ctx) {
  const Shape* value_shape = ctx->OutputShape("value", 0);
  DataType value_dtype = *ctx->OutputDType("value", 0);
  return value_shape->elem_cnt() * GetSizeOfDataType(value_dtype);
}

size_t InferGradTmpBufferSize(user_op::InferContext* ctx) {
  const Shape& value_shape = ctx->InputShape("value_grad", 0);
  const DataType& value_dtype = ctx->InputDType("value_grad", 0);
  return value_shape.elem_cnt() * GetSizeOfDataType(value_dtype);
}

}  // namespace

#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(dtype)                   \
  REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value")                             \
      .SetCreateFn<FusedSelfAttentionQueryMulKeyAndValueGpuKernel<dtype>>()                         \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                              \
                       && (user_op::HobDataType("hidden_states", 0) == GetDataType<dtype>::value))  \
      .SetInferTmpSizeFn(InferTmpBufferSize);

#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(dtype)              \
  REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value_grad")                        \
      .SetCreateFn<FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel<dtype>>()                     \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                              \
                       && (user_op::HobDataType("hidden_states", 0) == GetDataType<dtype>::value))  \
      .SetInferTmpSizeFn(InferGradTmpBufferSize);

REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float)
REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float16)
REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float)
REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float16)

}  // namespace oneflow
\ No newline at end of file