Push DeepSpeed 0.6.3 ROCm version

7d1a83a9 · aiss · ab5534fc · 7d1a83a9 · 7d1a83a9
Commit 7d1a83a9 authored May 25, 2022 by aiss
2 changed files
--- a/csrc/transformer_bak/inference/includes/cublas_wrappers_hip.h
+++ b/csrc/transformer_bak/inference/includes/cublas_wrappers_hip.h
+// !!! This is a file automatically generated by hipify!!!
+#pragma once
+
+#include <assert.h>
+#include <rocblas.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <mma.h>
+#include <stdio.h>
+
+// FP32 GEMM wrapper: forwards its arguments to rocblas_gemmex, passing hipR32F
+// for every data-type argument.
+//
+// The three leading-dimension arguments are derived here rather than taken
+// from the caller: m or k for A (depending on transa), k or n for B
+// (depending on transb), and m for C.
+//
+// Returns 0 when the call reports rocblas_status_success. Otherwise prints
+// m, n, k and the numeric status to stderr and returns EXIT_FAILURE.
+//
+// NOTE(review): this header is hipify output, and `rocblas_gemmex`, `hipR32F`
+// and `cublasGemmAlgo_t` are not declared in it. rocBLAS documents the entry
+// point as `rocblas_gemm_ex` with a different argument list, so confirm this
+// builds against the targeted ROCm release.
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const float* A,
+                   const float* B,
+                   float* C,
+                   cublasGemmAlgo_t algo)
+{
+    // Leading dimensions follow from whether each operand is transposed.
+    const int lda = (transa == rocblas_operation_none) ? m : k;
+    const int ldb = (transb == rocblas_operation_none) ? k : n;
+    const int ldc = m;
+
+    const rocblas_status result = rocblas_gemmex(handle,
+                                                 transa,
+                                                 transb,
+                                                 m,
+                                                 n,
+                                                 k,
+                                                 static_cast<const void*>(alpha),
+                                                 static_cast<const void*>(A),
+                                                 hipR32F,
+                                                 lda,
+                                                 static_cast<const void*>(B),
+                                                 hipR32F,
+                                                 ldb,
+                                                 static_cast<const void*>(beta),
+                                                 C,
+                                                 hipR32F,
+                                                 ldc,
+                                                 hipR32F,
+                                                 algo);
+
+    if (result == rocblas_status_success) { return 0; }
+
+    fprintf(stderr,
+            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
+            m,
+            n,
+            k,
+            (int)result);
+    return EXIT_FAILURE;
+}
+
+// FP16 GEMM wrapper: forwards its arguments to rocblas_gemmex, passing hipR16F
+// as the data-type argument for A, B and C and hipR32F as the final type
+// argument. alpha and beta are still supplied as float.
+//
+// The three leading-dimension arguments are derived here rather than taken
+// from the caller: m or k for A (depending on transa), k or n for B
+// (depending on transb), and m for C.
+//
+// Returns 0 when the call reports rocblas_status_success. Otherwise prints
+// m, n, k and the numeric status to stderr and returns EXIT_FAILURE.
+//
+// NOTE(review): this header is hipify output, and `rocblas_gemmex`, `hipR16F`
+// and `cublasGemmAlgo_t` are not declared in it. rocBLAS documents the entry
+// point as `rocblas_gemm_ex` with a different argument list, so confirm this
+// builds against the targeted ROCm release.
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const __half* A,
+                   const __half* B,
+                   __half* C,
+                   cublasGemmAlgo_t algo)
+{
+    // Leading dimensions follow from whether each operand is transposed.
+    const int lda = (transa == rocblas_operation_none) ? m : k;
+    const int ldb = (transb == rocblas_operation_none) ? k : n;
+    const int ldc = m;
+
+    const rocblas_status result = rocblas_gemmex(handle,
+                                                 transa,
+                                                 transb,
+                                                 m,
+                                                 n,
+                                                 k,
+                                                 static_cast<const void*>(alpha),
+                                                 static_cast<const void*>(A),
+                                                 hipR16F,
+                                                 lda,
+                                                 static_cast<const void*>(B),
+                                                 hipR16F,
+                                                 ldb,
+                                                 static_cast<const void*>(beta),
+                                                 static_cast<void*>(C),
+                                                 hipR16F,
+                                                 ldc,
+                                                 hipR32F,
+                                                 algo);
+
+    if (result == rocblas_status_success) { return 0; }
+
+    fprintf(stderr,
+            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
+            m,
+            n,
+            k,
+            (int)result);
+    return EXIT_FAILURE;
+}
+
+// FP32 strided-batched GEMM wrapper: forwards its arguments to
+// cublasGemmStridedBatchedEx, passing hipR32F for every data-type argument
+// and the caller's per-operand strides and batch count unchanged.
+//
+// The three leading-dimension arguments are derived here rather than taken
+// from the caller: m or k for A (depending on op_A), k or n for B
+// (depending on op_B), and m for C.
+//
+// Returns 0 when the call reports rocblas_status_success. Otherwise prints
+// batch, m, n, k and the numeric status to stderr and returns EXIT_FAILURE.
+//
+// NOTE(review): the callee keeps its cuBLAS name in this hipified header and
+// its result is stored in a rocblas_status. rocBLAS documents
+// `rocblas_gemm_strided_batched_ex` for this operation, so confirm the call
+// resolves on the targeted ROCm release.
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const float* A,
+                                const float* B,
+                                float* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                cublasGemmAlgo_t algo)
+{
+    // Leading dimensions follow from whether each operand is transposed.
+    const int lda = (op_A == rocblas_operation_none) ? m : k;
+    const int ldb = (op_B == rocblas_operation_none) ? k : n;
+    const int ldc = m;
+
+    const rocblas_status result = cublasGemmStridedBatchedEx(handle,
+                                                             op_A,
+                                                             op_B,
+                                                             m,
+                                                             n,
+                                                             k,
+                                                             alpha,
+                                                             A,
+                                                             hipR32F,
+                                                             lda,
+                                                             stride_A,
+                                                             B,
+                                                             hipR32F,
+                                                             ldb,
+                                                             stride_B,
+                                                             beta,
+                                                             C,
+                                                             hipR32F,
+                                                             ldc,
+                                                             stride_C,
+                                                             batch,
+                                                             hipR32F,
+                                                             algo);
+
+    if (result == rocblas_status_success) { return 0; }
+
+    fprintf(stderr,
+            "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
+            batch,
+            m,
+            n,
+            k,
+            (int)result);
+    return EXIT_FAILURE;
+}
+
+// FP16 strided-batched GEMM wrapper: forwards its arguments to
+// cublasGemmStridedBatchedEx, passing hipR16F as the data-type argument for
+// A, B and C and hipR32F as the final type argument. alpha and beta are
+// still supplied as float; strides and batch count are passed unchanged.
+//
+// The three leading-dimension arguments are derived here rather than taken
+// from the caller: m or k for A (depending on op_A), k or n for B
+// (depending on op_B), and m for C.
+//
+// Returns 0 when the call reports rocblas_status_success. Otherwise prints
+// batch, m, n, k and the numeric status to stderr and returns EXIT_FAILURE.
+// The batch count is part of the message so a failure here is reported with
+// the same fields as the float overload of this function.
+//
+// NOTE(review): the callee keeps its cuBLAS name in this hipified header and
+// its result is stored in a rocblas_status. rocBLAS documents
+// `rocblas_gemm_strided_batched_ex` for this operation, so confirm the call
+// resolves on the targeted ROCm release.
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const __half* A,
+                                const __half* B,
+                                __half* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                cublasGemmAlgo_t algo)
+{
+    rocblas_status status = cublasGemmStridedBatchedEx(handle,
+                                                       op_A,
+                                                       op_B,
+                                                       m,
+                                                       n,
+                                                       k,
+                                                       alpha,
+                                                       A,
+                                                       hipR16F,
+                                                       (op_A == rocblas_operation_none) ? m : k,
+                                                       stride_A,
+                                                       B,
+                                                       hipR16F,
+                                                       (op_B == rocblas_operation_none) ? k : n,
+                                                       stride_B,
+                                                       beta,
+                                                       C,
+                                                       hipR16F,
+                                                       m,
+                                                       stride_C,
+                                                       batch,
+                                                       hipR32F,
+                                                       algo);
+
+    if (status != rocblas_status_success) {
+        // Report the batch count too; it is needed to identify the failing problem.
+        fprintf(stderr,
+                "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
+                batch,
+                m,
+                n,
+                k,
+                (int)status);
+        return EXIT_FAILURE;
+    }
+
+    return 0;
+}
--- a/csrc/transformer_bak/inference/includes/custom_cuda_layers.h
+++ b/csrc/transformer_bak/inference/includes/custom_cuda_layers.h
+#pragma once
+
+// Defines HALF_PRECISION_AVAILABLE and includes the cooperative-groups header
+// that matches the toolchain.
+// The macro expands to the integer 1 rather than the token sequence `= 1`, so
+// it is usable in `#if HALF_PRECISION_AVAILABLE` as well as in `#ifdef` tests.
+#ifdef __HIP_PLATFORM_HCC__
+// HIP build: the macro is defined unconditionally.
+#define HALF_PRECISION_AVAILABLE 1
+#include <hip/hip_cooperative_groups.h>
+#else
+// Non-HIP build: the macro is defined only when __CUDA_ARCH__ >= 700.
+#if __CUDA_ARCH__ >= 700
+#define HALF_PRECISION_AVAILABLE 1
+#endif
+#include <cooperative_groups.h>
+#endif
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <cassert>
+#include <iostream>
+
+// Compile-time constants. Their users are not in this header, so the meanings
+// noted below are assumptions to confirm against the kernels.
+// Presumably the maximum number of warps per block — TODO confirm.
+#define MAX_WARP_NUM 32
+// NOTE(review): fixed at 32; AMD wavefronts can be 64 lanes wide — confirm
+// this is what the HIP build expects.
+#define WARP_SIZE 32
+// Presumably the SM count of the GPU the kernels were tuned for — TODO confirm.
+#define SMs 80
+
+// Presumably a per-thread register budget — TODO confirm.
+#define MAX_REGISTERS 256
+// Host-side function template declaration for the attention softmax (v2)
+// launcher, parameterized on the element type T. Only the declaration is
+// visible here; the definition lives elsewhere.
+// NOTE(review): parameter meanings are inferred from their names — `vals` and
+// `mask` presumably hold the scores and the attention mask, the bool/int
+// arguments presumably select the masking mode and problem shape, and `stream`
+// is presumably the stream the work is enqueued on. Confirm against the
+// definition.
+template <typename T>
+void launch_attn_softmax_v2(T* vals,
+                            T* mask,
+                            bool triangular,
+                            bool recompute,
+                            bool local_attention,
+                            int window_size,
+                            int batch_size,
+                            int heads,
+                            int num_seq,
+                            int sequence_length,
+                            float scale,
+                            cudaStream_t stream);
+
+// Fused bias add with gelu activation.
+// Host-side function template declaration, parameterized on the element type
+// T; the definition is not visible here. `input` is non-const while `bias` is
+// const, so `input` is presumably updated in place — TODO confirm.
+template <typename T>
+void launch_bias_gelu(T* input,
+                      const T* bias,
+                      int intermediate_size,
+                      int batch_size,
+                      cudaStream_t stream);
+// Host-side function template declaration for the bias-add launcher,
+// parameterized on the element type T; the definition is not visible here.
+// `input` is non-const while `bias` is const, so `input` is presumably updated
+// in place — TODO confirm.
+template <typename T>
+void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
+
+// Host-side function template declaration for the bias + residual launcher,
+// parameterized on the element type T; the definition is not visible here.
+// NOTE(review): all five buffers are non-const, so which of them are read and
+// which are written cannot be told from this declaration. `mp_size` is
+// presumably the model-parallel degree — confirm against the definition.
+template <typename T>
+void launch_bias_residual(T* input,
+                          T* output,
+                          T* attn,
+                          T* bias,
+                          T* attn_bias,
+                          int batch,
+                          int hidden_dim,
+                          int mp_size,
+                          cudaStream_t stream);
+
+// Host-side function template declaration for the layer-norm launcher,
+// parameterized on the element type T; the definition is not visible here.
+// `gamma` and `beta` are const inputs; `out` and `vals` are non-const.
+// NOTE(review): presumably `vals` is the input and `out` the normalized
+// result, with `epsilon` the variance stabilizer — confirm against the
+// definition.
+template <typename T>
+void launch_layer_norm(T* out,
+                       T* vals,
+                       const T* gamma,
+                       const T* beta,
+                       float epsilon,
+                       int batch_size,
+                       int hidden_dim,
+                       cudaStream_t stream);
+
+// Host-side function template declaration for the residual + layer-norm
+// launcher, parameterized on the element type T; the definition is not
+// visible here. `bias`, `gamma` and `beta` are const inputs; the other four
+// buffers are non-const.
+// NOTE(review): the roles of `norm`, `res_add`, `vals` and `residual`, and the
+// effect of `preLN` / `mlp_after_attn`, cannot be told from this declaration —
+// confirm against the definition.
+template <typename T>
+void launch_residual_layer_norm(T* norm,
+                                T* res_add,
+                                T* vals,
+                                T* residual,
+                                const T* bias,
+                                const T* gamma,
+                                const T* beta,
+                                float epsilon,
+                                int batch_size,
+                                int hidden_dim,
+                                bool preLN,
+                                bool mlp_after_attn,
+                                cudaStream_t stream);
+// Host-side function template declaration for the dequantize launcher,
+// parameterized on the output element type T; the definition is not visible
+// here. Takes a const int8_t buffer and a const float scale buffer and a
+// non-const T buffer `output`.
+// NOTE(review): presumably `qscale` holds one scale per quantization group and
+// `groups` / `merge_count` describe the grouping — confirm against the
+// definition.
+template <typename T>
+void launch_dequantize(T* output,
+                       const int8_t* input,
+                       const float* qscale,
+                       unsigned output_size,
+                       unsigned hidden_dim,
+                       unsigned groups,
+                       unsigned merge_count,
+                       cudaStream_t stream);
+
+// Host-side function template declaration for the GPT-J residual-add
+// launcher, parameterized on the element type T; the definition is not
+// visible here.
+// NOTE(review): all five buffers are non-const, so which of them are read and
+// which are written cannot be told from this declaration. `mp_size` is
+// presumably the model-parallel degree — confirm against the definition.
+template <typename T>
+void launch_gptj_residual_add(T* input,
+                              T* output,
+                              T* attn,
+                              T* bias,
+                              T* attn_bias,
+                              int batch,
+                              int head_size,
+                              int mp_size,
+                              cudaStream_t stream);
+
+// Host-side function template declaration for the rotary positional embedding
+// launcher, parameterized on the element type T; the definition is not
+// visible here. Both `mixed_query` and `key_layer` are non-const.
+// NOTE(review): presumably both buffers are rotated in place and
+// `rotate_half` / `rotate_every_two` select the rotation layout — confirm
+// against the definition.
+template <typename T>
+void launch_apply_rotary_pos_emb(T* mixed_query,
+                                 T* key_layer,
+                                 unsigned head_size,
+                                 unsigned seq_len,
+                                 unsigned rotary_dim,
+                                 unsigned offset,
+                                 unsigned num_heads,
+                                 unsigned batch,
+                                 bool rotate_half,
+                                 bool rotate_every_two,
+                                 cudaStream_t stream);
+
+// Host-side function template declaration for the MoE residual matmul
+// launcher, parameterized on the element type T; the definition is not
+// visible here.
+// NOTE(review): all three buffers are non-const, so which one receives the
+// result cannot be told from this declaration — confirm against the
+// definition.
+template <typename T>
+void launch_moe_res_matmul(T* residual,
+                           T* coef,
+                           T* mlp_out,
+                           int seq_len,
+                           int hidden_dim,
+                           cudaStream_t stream);