Commit 67ea635f authored by aiss

push dsv0.8.2 version

parent 1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <cstdint>
#include <iostream>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE 1
#endif
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define CUDA_CHECK(callstr)                                                              \
    {                                                                                    \
        hipError_t error_code = callstr;                                                 \
        if (error_code != hipSuccess) {                                                  \
            std::cerr << "HIP error " << error_code << " at " << __FILE__ << ":"        \
                      << __LINE__ << std::endl;                                          \
            assert(0);                                                                   \
        }                                                                                \
    }
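// Editorial sketch (not part of the original header): how CUDA_CHECK is intended to
// wrap a HIP runtime call so any failure aborts with file/line context. The helper
// name example_checked_alloc is hypothetical.
inline void example_checked_alloc(void** ptr, size_t bytes)
{
    CUDA_CHECK(hipMalloc(ptr, bytes));  // aborts via assert(0) on any non-hipSuccess code
}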
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence length supported, derived from the per-block thread limit
// (MAX_THREADS = 1024) times the per-thread iteration count below: 1024 * 8 gives an
// 8K cap. Higher sequence lengths need a larger iteration count, e.g. 32 for 64K.
#define MAX_THREAD_ITERATIONS 8  // caps supported sequence length at 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
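// Editorial note: the launchers below quantize `vals` in place over total_count
// elements split into group_num groups, to num_bits bits per value. Reading the
// names, the _sr variants use stochastic rounding and the _asym variants an
// asymmetric (zero-point) quantization range.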
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
const T* bias,
T* output,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T>
void launch_gelu(const T* input,
T* output,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T>
void launch_d_gelu(T* d_output,
const T* input,
const T* bias,
int intermediate_size,
int batch_size,
hipStream_t stream);
// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars,
T* means);
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars);
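// Editorial note: the first overload above stores both per-row variance and mean
// (the useMean/checkpoint path); this second overload stores only the variance and
// serves the mean-free path used by the invertible layer norm.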
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
const T* vals,
const T* out_grad_trans,
const T* vals_trans,
const T* means,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);
template <typename T>
void launch_attn_softmax_backward(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
const T* attn_mask,
int batch_size,
int heads,
int sequence_length,
hipStream_t stream);
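// Editorial note: "0213" names the dimension permutation [0, 1, 2, 3] -> [0, 2, 1, 3],
// i.e. [batch, seq, heads, head_dim] -> [batch, heads, seq, head_dim], the layout the
// batched attention GEMMs expect (see the 4D transform further below).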
template <typename T>
void launch_transform_0213(T* output,
const T* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream);
// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
const T* vals,
const T* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream,
int trans_count);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
hipStream_t stream,
int trans_count);
template <typename T>
void launch_dropout(T* vals,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
hipStream_t stream);
template <typename T>
void launch_dropout(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
int dim,
float ratio,
hipStream_t stream,
bool bwd = false);
template <typename T>
void launch_dropout(T* out,
const T* vals,
const T* residual,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
hipStream_t stream);
template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream);
template <typename T>
void launch_dropout_grad(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
float ratio,
hipStream_t stream);
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
T* out,
int rows,
int cols,
hipStream_t stream);
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream);
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <cstdint>
#include <stdexcept>
#include "custom_cuda_layers.h"
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
uint32_t dim;
bool training;
Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
inline void SetDim(uint32_t d) { dim = d; }
};
Dropout(const Config& config) : _mask(nullptr), _config(config) {}
virtual ~Dropout() {}
void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false)
{
launch_dropout<T>(
out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
}
void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
{
launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void ForwardWithBias(int bsz,
T* out,
const T* vals,
const T* residual,
const T* bias,
cudaStream_t stream)
{
launch_dropout<T>(
out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(
d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
void SetMask(uint8_t* mask)
{
if (!mask) { throw std::runtime_error("Dropout mask is null."); }
_mask = mask;
}
Config GetConfig() const { return _config; }
inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }
private:
uint8_t* _mask;
Config _config;
};
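// Editorial usage sketch (sizes hypothetical, helper name not in the original): the
// caller owns the mask buffer, which must hold at least bsz * dim bytes before
// Forward runs in training mode.
template <typename T>
inline void example_dropout_forward(T* out, const T* in, uint8_t* mask, cudaStream_t stream)
{
    typedef typename Dropout<T>::Config DropoutConfig;
    Dropout<T> drop(DropoutConfig(0.1f, 1024));  // ratio=0.1, dim=1024
    drop.SetMask(mask);                          // throws if mask is null
    drop.Forward(/*bsz=*/8, out, in, stream);    // 8 rows of 1024 elements each
}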
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <cstdint>
#include <stdexcept>
#include "custom_hip_layers.h"
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
uint32_t dim;
bool training;
Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
inline void SetDim(uint32_t d) { dim = d; }
};
Dropout(const Config& config) : _mask(nullptr), _config(config) {}
virtual ~Dropout() {}
void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
{
launch_dropout<T>(
out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
}
void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
{
launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void ForwardWithBias(int bsz,
T* out,
const T* vals,
const T* residual,
const T* bias,
hipStream_t stream)
{
launch_dropout<T>(
out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals, hipStream_t stream)
{
launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
{
launch_dropout_grad<T>(
d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
void SetMask(uint8_t* mask)
{
if (!mask) { throw std::runtime_error("Dropout mask is null."); }
_mask = mask;
}
Config GetConfig() const { return _config; }
inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }
private:
uint8_t* _mask;
Config _config;
};
#pragma once
#include <cuda_runtime_api.h>
#include <curand.h>
#include <array>
#include <memory>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "dropout.h"
#include "feed_forward.h"
#include "gelu.h"
#include "general_kernels.h"
#include "normalize_layer.h"
#include "softmax.h"
#include "strided_batch_gemm.h"
struct BertGemmAlgos {
int m_gemm_qkv_algo;
int m_gemm_inter_algo;
int m_gemm_output_algo;
int m_gemm_batch1_algo;
int m_gemm_batch2_algo;
BertGemmAlgos()
: m_gemm_qkv_algo(-1),
m_gemm_inter_algo(-1),
m_gemm_output_algo(-1),
m_gemm_batch1_algo(-1),
m_gemm_batch2_algo(-1)
{
}
};
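// Editorial note: an algorithm index of -1 selects the backend default (on the CUDA
// side, CUBLAS_GEMM_DEFAULT); non-negative values pin a specific algorithm, e.g. one
// found by the GemmTest tuner further below.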
template <typename T>
class BertTransformerLayer {
public:
BertTransformerLayer(unsigned layer_id,
unsigned batch_size,
unsigned hidden_size,
unsigned num_heads,
unsigned intermediate_size,
unsigned seq_length,
float attn_dropout_ratio,
float hidden_output_dropout_ratio,
float layer_norm_eps,
bool pre_or_postLayerNorm,
const std::vector<std::array<int, 3>>& gemm_algos,
bool attn_dropout_checkpoint,
bool normalize_invertible,
bool gelu_checkpoint,
bool stochastic_mode);
virtual ~BertTransformerLayer();
void Forward(unsigned bsz,
const T* input_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_qkvb_ptr,
const T* attn_ow_ptr,
const T* attn_ob_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* output_b_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* out_ptr,
T* inp_norm_ptr,
T* q_tf_ptr,
T* k_tf_ptr,
T* v_tf_ptr,
T* softmax_output_ptr,
T* ctx_bufB_ptr,
T* attn_o_inp_ptr,
T* add_res_ptr,
T* ff1_inp_ptr,
T* gelu_inp_ptr,
T* ff2_inp_ptr);
void Backward(unsigned bsz,
const T* grad_output_ptr,
const T* input_ptr,
const T* output_ptr,
const T* inp_norm_ptr,
const T* q_tf_ptr,
const T* k_tf_ptr,
const T* v_tf_ptr,
const T* softmax_output_ptr,
const T* ctx_bufB_ptr,
const T* attn_o_inp_ptr,
const T* add_res_ptr,
const T* ff1_inp_ptr,
const T* gelu_inp_ptr,
const T* ff2_inp_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_ow_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* grad_input_ptr,
T* grad_attn_qkvw_ptr,
T* grad_attn_qkvb_ptr,
T* grad_attn_ow_ptr,
T* grad_attn_ob_ptr,
T* grad_attn_nw_ptr,
T* grad_attn_nb_ptr,
T* grad_inter_w_ptr,
T* grad_inter_b_ptr,
T* grad_output_w_ptr,
T* grad_output_b_ptr,
T* grad_norm_w_ptr,
T* grad_norm_b_ptr);
void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
uint8_t* attn_output_dropout_mask_ptr,
uint8_t* layer_output_dropout_mask_ptr,
T* layer_norm_var,
T* layer_norm_mean,
T* attn_layer_norm_var,
T* attn_layer_norm_mean);
inline unsigned GetBatchSize() const { return _batch_size; }
inline unsigned GetNumHeads() const { return _heads; }
inline unsigned GetSeqLength() const { return _seq_length; }
inline unsigned GetIntermediateSize() const { return _intermediate_size; }
void SetSeqLength(unsigned seq_len);
inline unsigned GetHiddenSize() const { return _hidden_size; }
void SetTrainingMode(bool training);
inline bool IsTrainingMode() const { return _training; }
inline bool GeluCheckpoint() const { return _gelu_checkpoint; }
private:
void Initialize();
size_t getWorkspaceSize(int maxBatchSize) const;
// Params
unsigned _layer_id;
unsigned _batch_size;
unsigned _hidden_size;
unsigned _heads;
unsigned _size_per_head;
unsigned _intermediate_size;
unsigned _seq_length;
bool _pre_or_postLayerNorm;
cublasHandle_t _cublasHandle;
cudaStream_t _stream;
// layers
FeedForward<T> _qkv_linear;
FeedForward<T> _attn_out_linear;
Normalize_Layer<T> _attn_layer_norm;
Normalize_Layer<T> _layer_norm;
Normalize_Layer<T>* _last_normalize;
FeedForward<T> _ff1, _ff2;
Softmax<T> _softmax;
Gelu<T> _gelu;
Dropout<T> _attn_prob_dropout;
Dropout<T> _attn_output_dropout;
Dropout<T> _layer_output_dropout;
StridedBatchGemm<T> _attn_scores;
StridedBatchGemm<T> _attn_context;
bool _training;
// Memory saving flags
bool _attn_dropout_checkpoint;
bool _normalize_invertible;
bool _gelu_checkpoint;
// High Performance flags
bool _stochastic_mode;
};
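// Editorial construction sketch (all sizes hypothetical, helper name not in the
// original): one encoder layer with hidden=1024, 16 heads, 4x intermediate,
// seq_length=512, pre-LayerNorm, and all memory-saving/stochastic flags off.
template <typename T>
inline std::unique_ptr<BertTransformerLayer<T>> example_make_layer(
    const std::vector<std::array<int, 3>>& gemm_algos)
{
    return std::unique_ptr<BertTransformerLayer<T>>(
        new BertTransformerLayer<T>(/*layer_id=*/0,
                                    /*batch_size=*/8,
                                    /*hidden_size=*/1024,
                                    /*num_heads=*/16,
                                    /*intermediate_size=*/4096,
                                    /*seq_length=*/512,
                                    /*attn_dropout_ratio=*/0.1f,
                                    /*hidden_output_dropout_ratio=*/0.1f,
                                    /*layer_norm_eps=*/1e-12f,
                                    /*pre_or_postLayerNorm=*/true,
                                    gemm_algos,
                                    /*attn_dropout_checkpoint=*/false,
                                    /*normalize_invertible=*/false,
                                    /*gelu_checkpoint=*/false,
                                    /*stochastic_mode=*/false));
}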
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime_api.h>
#include <hiprand/hiprand.h>
#include <array>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "dropout_hip.h"
#include "feed_forward_hip.h"
#include "gelu_hip.h"
#include "general_kernels_hip.h"
#include "normalize_layer_hip.h"
#include "softmax_hip.h"
#include "strided_batch_gemm_hip.h"
struct BertGemmAlgos {
int m_gemm_qkv_algo;
int m_gemm_inter_algo;
int m_gemm_output_algo;
int m_gemm_batch1_algo;
int m_gemm_batch2_algo;
BertGemmAlgos()
: m_gemm_qkv_algo(-1),
m_gemm_inter_algo(-1),
m_gemm_output_algo(-1),
m_gemm_batch1_algo(-1),
m_gemm_batch2_algo(-1)
{
}
};
template <typename T>
class BertTransformerLayer {
public:
BertTransformerLayer(unsigned layer_id,
unsigned batch_size,
unsigned hidden_size,
unsigned num_heads,
unsigned intermediate_size,
unsigned seq_length,
float attn_dropout_ratio,
float hidden_output_dropout_ratio,
float layer_norm_eps,
bool pre_or_postLayerNorm,
const std::vector<std::array<int, 3>>& gemm_algos,
bool attn_dropout_checkpoint,
bool normalize_invertible,
bool gelu_checkpoint,
bool stochastic_mode);
virtual ~BertTransformerLayer();
void Forward(unsigned bsz,
const T* input_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_qkvb_ptr,
const T* attn_ow_ptr,
const T* attn_ob_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* output_b_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* out_ptr,
T* inp_norm_ptr,
T* q_tf_ptr,
T* k_tf_ptr,
T* v_tf_ptr,
T* softmax_output_ptr,
T* ctx_bufB_ptr,
T* attn_o_inp_ptr,
T* add_res_ptr,
T* ff1_inp_ptr,
T* gelu_inp_ptr,
T* ff2_inp_ptr);
void Backward(unsigned bsz,
const T* grad_output_ptr,
const T* input_ptr,
const T* output_ptr,
const T* inp_norm_ptr,
const T* q_tf_ptr,
const T* k_tf_ptr,
const T* v_tf_ptr,
const T* softmax_output_ptr,
const T* ctx_bufB_ptr,
const T* attn_o_inp_ptr,
const T* add_res_ptr,
const T* ff1_inp_ptr,
const T* gelu_inp_ptr,
const T* ff2_inp_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_ow_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* grad_input_ptr,
T* grad_attn_qkvw_ptr,
T* grad_attn_qkvb_ptr,
T* grad_attn_ow_ptr,
T* grad_attn_ob_ptr,
T* grad_attn_nw_ptr,
T* grad_attn_nb_ptr,
T* grad_inter_w_ptr,
T* grad_inter_b_ptr,
T* grad_output_w_ptr,
T* grad_output_b_ptr,
T* grad_norm_w_ptr,
T* grad_norm_b_ptr);
void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
uint8_t* attn_output_dropout_mask_ptr,
uint8_t* layer_output_dropout_mask_ptr,
T* layer_norm_var,
T* layer_norm_mean,
T* attn_layer_norm_var,
T* attn_layer_norm_mean);
inline unsigned GetBatchSize() const { return _batch_size; }
inline unsigned GetNumHeads() const { return _heads; }
inline unsigned GetSeqLength() const { return _seq_length; }
inline unsigned GetIntermediateSize() const { return _intermediate_size; }
void SetSeqLength(unsigned seq_len);
inline unsigned GetHiddenSize() const { return _hidden_size; }
void SetTrainingMode(bool training);
inline bool IsTrainingMode() const { return _training; }
inline bool GeluCheckpoint() const { return _gelu_checkpoint; }
private:
void Initialize();
size_t getWorkspaceSize(int maxBatchSize) const;
// Params
unsigned _layer_id;
unsigned _batch_size;
unsigned _hidden_size;
unsigned _heads;
unsigned _size_per_head;
unsigned _intermediate_size;
unsigned _seq_length;
bool _pre_or_postLayerNorm;
rocblas_handle _cublasHandle;
hipStream_t _stream;
// layers
FeedForward<T> _qkv_linear;
FeedForward<T> _attn_out_linear;
Normalize_Layer<T> _attn_layer_norm;
Normalize_Layer<T> _layer_norm;
Normalize_Layer<T>* _last_normalize;
FeedForward<T> _ff1, _ff2;
Softmax<T> _softmax;
Gelu<T> _gelu;
Dropout<T> _attn_prob_dropout;
Dropout<T> _attn_output_dropout;
Dropout<T> _layer_output_dropout;
StridedBatchGemm<T> _attn_scores;
StridedBatchGemm<T> _attn_context;
bool _training;
// Memory saving flags
bool _attn_dropout_checkpoint;
bool _normalize_invertible;
bool _gelu_checkpoint;
// High Performance flags
bool _stochastic_mode;
};
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class FeedForward {
public:
struct Config {
int batchSize, outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
: batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
{
}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
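// Editorial note: with cuBLAS column-major conventions the Forward call below computes
// out[bsz x outputSize] = input[bsz x inputSize] * weights^T, where weights are stored
// row-major as [outputSize x inputSize] and passed with op=T, so no transpose copy of
// the weight matrix is needed.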
void Forward(int bsz,
const T* input_ptr,
const T* weights,
T* out,
cublasHandle_t& _cublasHandle)
{
float alpha = 1.0f;
float beta = 0.0f;
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_T,
CUBLAS_OP_N,
config_.outputSize,
bsz,
config_.inputSize,
&alpha,
&beta,
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
const T* input_ptr,
const T* weights,
T* weights_grad,
T* bias_grad,
cublasHandle_t& _cublasHandle,
cudaStream_t& stream,
T* inp_grad_out = nullptr,
T* out_grad_trans_out = nullptr)
{
float alpha = 1.0f, beta = 0.0f;
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_T,
config_.inputSize,
config_.outputSize,
bsz,
&alpha,
&beta,
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
config_.inputSize,
bsz,
config_.outputSize,
&alpha,
&beta,
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
private:
Config config_;
};
#endif
// !!! This is a file automatically generated by hipify!!!
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class FeedForward {
public:
struct Config {
int batchSize, outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
: batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
{
}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz,
const T* input_ptr,
const T* weights,
T* out,
rocblas_handle& _cublasHandle)
{
float alpha = 1.0f;
float beta = 0.0f;
cublas_gemm_ex(_cublasHandle,
rocblas_operation_transpose,
rocblas_operation_none,
config_.outputSize,
bsz,
config_.inputSize,
&alpha,
&beta,
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
const T* input_ptr,
const T* weights,
T* weights_grad,
T* bias_grad,
rocblas_handle& _cublasHandle,
hipStream_t& stream,
T* inp_grad_out = nullptr,
T* out_grad_trans_out = nullptr)
{
float alpha = 1.0f, beta = 0.0f;
cublas_gemm_ex(_cublasHandle,
rocblas_operation_none,
rocblas_operation_transpose,
config_.inputSize,
config_.outputSize,
bsz,
&alpha,
&beta,
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif
cublas_gemm_ex(_cublasHandle,
rocblas_operation_none,
rocblas_operation_none,
config_.inputSize,
bsz,
config_.outputSize,
&alpha,
&beta,
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
private:
Config config_;
};
#endif
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class Gelu {
public:
struct Config {
uint32_t intermediate_size;
Config(uint32_t inter_size) : intermediate_size(inter_size) {}
};
Gelu(const Config& config) : _config(config) {}
virtual ~Gelu() {}
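// Editorial note: the launched kernels implement the tanh approximation of GELU,
// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), with the bias
// add fused into the same kernel in ForwardWithBiasAdd.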
void ForwardWithBiasAdd(int bsz,
const T* input_buf,
const T* bias,
T* output,
cudaStream_t stream)
{
launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
}
void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream)
{
launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
}
private:
Config _config;
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class Gelu {
public:
struct Config {
uint32_t intermediate_size;
Config(uint32_t inter_size) : intermediate_size(inter_size) {}
};
Gelu(const Config& config) : _config(config) {}
virtual ~Gelu() {}
void ForwardWithBiasAdd(int bsz,
const T* input_buf,
const T* bias,
T* output,
hipStream_t stream)
{
launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
}
void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
{
launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
}
private:
Config _config;
};
#pragma once
#include <cuda_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <iostream>
#include <memory>
#include <string>
#include "StopWatch.h"
#include "cublas_wrappers.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result) {
std::cout << (std::string("CUDA runtime error: ") + file + ":" + std::to_string(line) +
" \n");
}
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
: M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K));
check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N));
check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N));
}
~GemmTest()
{
check_cuda_error(cudaFree(A));
check_cuda_error(cudaFree(B));
check_cuda_error(cudaFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = 1.0f;
float beta = 0.0f;
int algo_fw = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
N,
M,
K,
&alpha,
&beta,
B,
A,
C,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
K,
M,
N,
&alpha,
&beta,
B,
C,
A,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
cudaDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
cudaDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int M, N, K;
cublasHandle_t handle;
cublasOperation_t transa, transb;
T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
StridedGemmTest(int b,
int m,
int n,
int k,
cublasOperation_t ta,
cublasOperation_t tb,
cublasHandle_t h)
: bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz));
check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz));
check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz));
}
~StridedGemmTest()
{
check_cuda_error(cudaFree(A));
check_cuda_error(cudaFree(B));
check_cuda_error(cudaFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = 1.0f;
float beta = 0.0f;
int algo_fw = Run(loops, [=](int algo) {
int stride_a = M * K;
int stride_b = N * K;
int stride_c = M * N;
cublas_strided_batched_gemm(handle,
M,
N,
K,
&alpha,
&beta,
A,
B,
C,
transa,
transb,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
int mb = (transa == CUBLAS_OP_T ? K : M);
int kb = (transa == CUBLAS_OP_T ? M : K);
int stride_a = mb * N;
int stride_b = N * kb;
int stride_c = M * K;
// B needs to be transposed.
cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
N,
&alpha,
&beta,
(transa == CUBLAS_OP_T ? B : C),
(transa == CUBLAS_OP_T ? C : B),
A,
CUBLAS_OP_N,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
// A needs to be transposed.
cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
int stride_a = M * K;
int stride_b = M * N;
int stride_c = N * K;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
op_a,
CUBLAS_OP_N,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
cudaDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
cudaDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int bsz, M, N, K;
cublasHandle_t handle;
cublasOperation_t transa, transb;
T *A, *B, *C;
};
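// Editorial usage sketch (sizes hypothetical, helper name not in the original):
// benchmark the available GEMM algorithms for a 1024^3 problem and return the fastest
// forward/backward-1/backward-2 algorithm indices.
inline std::array<int, 3> example_tune_gemm(cublasHandle_t handle)
{
    GemmTest<float> test(1024, 1024, 1024, CUBLAS_OP_T, CUBLAS_OP_N, handle);
    return test.TestAlgo(/*loops=*/100);  // average latency over 100 launches per algo
}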
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <iostream>
#include <memory>
#include <string>
#include "StopWatch.h"
#include "cublas_wrappers_hip.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result) {
std::cout << (std::string("HIP runtime error: ") + file + ":" + std::to_string(line) +
" \n");
}
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
: M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
}
~GemmTest()
{
check_cuda_error(hipFree(A));
check_cuda_error(hipFree(B));
check_cuda_error(hipFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = 1.0f;
float beta = 0.0f;
int algo_fw = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_transpose,
rocblas_operation_none,
N,
M,
K,
&alpha,
&beta,
B,
A,
C,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_none,
rocblas_operation_transpose,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_none,
rocblas_operation_none,
K,
M,
N,
&alpha,
&beta,
B,
C,
A,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
hipDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
hipDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int M, N, K;
rocblas_handle handle;
rocblas_operation transa, transb;
T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
StridedGemmTest(int b,
int m,
int n,
int k,
rocblas_operation ta,
rocblas_operation tb,
rocblas_handle h)
: bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
}
~StridedGemmTest()
{
check_cuda_error(hipFree(A));
check_cuda_error(hipFree(B));
check_cuda_error(hipFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = 1.0f;
float beta = 0.0f;
int algo_fw = Run(loops, [=](int algo) {
int stride_a = M * K;
int stride_b = N * K;
int stride_c = M * N;
cublas_strided_batched_gemm(handle,
M,
N,
K,
&alpha,
&beta,
A,
B,
C,
transa,
transb,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
int mb = (transa == rocblas_operation_transpose ? K : M);
int kb = (transa == rocblas_operation_transpose ? M : K);
int stride_a = mb * N;
int stride_b = N * kb;
int stride_c = M * K;
// B needs to be transposed.
rocblas_operation op_b = (transb == rocblas_operation_transpose ? rocblas_operation_none
: rocblas_operation_transpose);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
N,
&alpha,
&beta,
(transa == rocblas_operation_transpose ? B : C),
(transa == rocblas_operation_transpose ? C : B),
A,
rocblas_operation_none,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
// A needs to be transposed.
rocblas_operation op_a = (transa == rocblas_operation_transpose ? rocblas_operation_none
: rocblas_operation_transpose);
int stride_a = M * K;
int stride_b = M * N;
int stride_c = N * K;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
op_a,
rocblas_operation_none,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
hipDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
hipDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int bsz, M, N, K;
rocblas_handle handle;
rocblas_operation transa, transb;
T *A, *B, *C;
};
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits>
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define THREADS 256
#define TILE_DIM 32
#define minus_infinity (-std::numeric_limits<float>::infinity())
#define FINAL_MASK 0xffffffff
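// Editorial note: the fused-add launchers below sum two, three, or four tensors of
// shape [batch_size, seq_length, hidden_size] elementwise into `out` in a single
// kernel, avoiding one pass over the gradient buffers per addend.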
template <typename T>
void launch_fused_add2(T* out,
const T* inp1,
const T* inp2,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream);
template <typename T>
void launch_fused_add4(T* out,
const T* inp1,
const T* inp2,
const T* inp3,
const T* inp4,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream);
template <typename T>
void launch_fused_add3(T* out,
const T* inp1,
const T* inp2,
const T* inp3,
int batch_size,
int seq_length,
int hidden_size,
cudaStream_t& stream);
// !!! This is a file automatically generated by hipify!!!
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits>
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define THREADS 256
#define TILE_DIM 32
#define minus_infinity (-std::numeric_limits<float>::infinity())
#define FINAL_MASK 0xffffffff
template <typename T>
void launch_fused_add2(T* out,
const T* inp1,
const T* inp2,
int batch_size,
int seq_length,
int hidden_size,
hipStream_t& stream);
template <typename T>
void launch_fused_add4(T* out,
const T* inp1,
const T* inp2,
const T* inp3,
const T* inp4,
int batch_size,
int seq_length,
int hidden_size,
hipStream_t& stream);
template <typename T>
void launch_fused_add3(T* out,
const T* inp1,
const T* inp2,
const T* inp3,
int batch_size,
int seq_length,
int hidden_size,
hipStream_t& stream);
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_cuda_layers.h"
using namespace std;
template <typename T>
class Normalize_Layer {
public:
struct Config {
uint32_t batchSize;
uint32_t seqLength;
uint32_t hiddenDim;
float epsilon;
bool training;
bool useMean;
Config(uint32_t batch,
uint32_t seq,
uint32_t h,
float epsilon = 1e-12,
bool training = true,
bool useMean = true)
: batchSize(batch),
seqLength(seq),
hiddenDim(h),
epsilon(epsilon),
training(training),
useMean(useMean)
{
}
};
Normalize_Layer(Config config)
: config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
{
}
~Normalize_Layer() {}
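// Editorial note: when config_.useMean is true the layer stores per-row mean and
// variance (set via SetMean/SetVar) and the Backward overloads that take norm_in are
// used; when false, the invertible path reconstructs the input from the normalized
// output norm_out together with gamma and betta.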
void ForwardCheckpoint(int bsz, // batch * seq
T* vals,
const T* residual,
const T* gamma,
const T* betta,
cudaStream_t& stream,
bool preLayerNorm = false)
{
launch_bias_residual_layer_norm(vals,
residual,
gamma,
betta,
config_.epsilon,
bsz,
config_.hiddenDim,
stream,
preLayerNorm,
config_.training,
vars,
means);
}
void Forward(int bsz,
T* vals,
const T* residual,
const T* gamma,
const T* betta,
cudaStream_t& stream,
bool preLayerNorm = false)
{
launch_bias_residual_layer_norm(vals,
residual,
gamma,
betta,
config_.epsilon,
bsz,
config_.hiddenDim,
stream,
preLayerNorm,
config_.training,
vars);
}
void Backward(int bsz,
const T* out_grad,
const T* gamma,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_in = nullptr)
{
launch_layerNorm_backward(out_grad,
norm_in,
vars,
means,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream);
}
void Backward(int bsz,
const T* out_grad,
const T* gamma,
const T* betta,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_out)
{
launch_layerNorm_backward(out_grad,
norm_out,
vars,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream,
!config_.useMean,
betta);
}
void BackwardFusedAdd(int bsz,
const T* out_grad1,
const T* out_grad2,
const T* gamma,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_in = nullptr)
{
launch_layerNorm_backward_fused_add(out_grad1,
out_grad2,
norm_in,
vars,
means,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream);
}
void BackwardFusedAdd(int bsz,
const T* out_grad1,
const T* out_grad2,
const T* gamma,
const T* betta,
T* gamma_grad,
T* betta_grad,
cudaStream_t stream[2],
T* inp_grad_out,
const T* norm_out)
{
launch_layerNorm_backward_fused_add(out_grad1,
out_grad2,
norm_out,
vars,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream,
!config_.useMean,
betta);
}
inline bool UseMean() const { return config_.useMean; }
inline void SetVar(T* variance)
{
if (!variance) { throw std::runtime_error("Normalize variance is null."); }
vars = variance;
}
inline void SetMean(T* mean)
{
if (!mean) { throw std::runtime_error("Normalize mean is null."); }
means = mean;
}
private:
Config config_;
T* vars;
T* means;
T* vals_hat;
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_hip_layers.h"
using namespace std;
template <typename T>
class Normalize_Layer {
public:
struct Config {
uint32_t batchSize;
uint32_t seqLength;
uint32_t hiddenDim;
float epsilon;
bool training;
bool useMean;
Config(uint32_t batch,
uint32_t seq,
uint32_t h,
float epsilon = 1e-12,
bool training = true,
bool useMean = true)
: batchSize(batch),
seqLength(seq),
hiddenDim(h),
epsilon(epsilon),
training(training),
useMean(useMean)
{
}
};
Normalize_Layer(Config config)
: config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
{
}
~Normalize_Layer() {}
void ForwardCheckpoint(int bsz, // batch * seq
T* vals,
const T* residual,
const T* gamma,
const T* betta,
hipStream_t& stream,
bool preLayerNorm = false)
{
launch_bias_residual_layer_norm(vals,
residual,
gamma,
betta,
config_.epsilon,
bsz,
config_.hiddenDim,
stream,
preLayerNorm,
config_.training,
vars,
means);
}
void Forward(int bsz,
T* vals,
const T* residual,
const T* gamma,
const T* betta,
hipStream_t& stream,
bool preLayerNorm = false)
{
launch_bias_residual_layer_norm(vals,
residual,
gamma,
betta,
config_.epsilon,
bsz,
config_.hiddenDim,
stream,
preLayerNorm,
config_.training,
vars);
}
void Backward(int bsz,
const T* out_grad,
const T* gamma,
T* gamma_grad,
T* betta_grad,
hipStream_t stream[2],
T* inp_grad_out,
const T* norm_in = nullptr)
{
launch_layerNorm_backward(out_grad,
norm_in,
vars,
means,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream);
}
void Backward(int bsz,
const T* out_grad,
const T* gamma,
const T* betta,
T* gamma_grad,
T* betta_grad,
hipStream_t stream[2],
T* inp_grad_out,
const T* norm_out)
{
launch_layerNorm_backward(out_grad,
norm_out,
vars,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream,
!config_.useMean,
betta);
}
void BackwardFusedAdd(int bsz,
const T* out_grad1,
const T* out_grad2,
const T* gamma,
T* gamma_grad,
T* betta_grad,
hipStream_t stream[2],
T* inp_grad_out,
const T* norm_in = nullptr)
{
launch_layerNorm_backward_fused_add(out_grad1,
out_grad2,
norm_in,
vars,
means,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream);
}
void BackwardFusedAdd(int bsz,
const T* out_grad1,
const T* out_grad2,
const T* gamma,
const T* betta,
T* gamma_grad,
T* betta_grad,
hipStream_t stream[2],
T* inp_grad_out,
const T* norm_out)
{
launch_layerNorm_backward_fused_add(out_grad1,
out_grad2,
norm_out,
vars,
gamma,
gamma_grad,
betta_grad,
inp_grad_out,
bsz,
config_.hiddenDim,
stream,
!config_.useMean,
betta);
}
inline bool UseMean() const { return config_.useMean; }
inline void SetVar(T* variance)
{
if (!variance) { throw std::runtime_error("Normalize variance is null."); }
vars = variance;
}
inline void SetMean(T* mean)
{
if (!mean) { throw std::runtime_error("Normalize mean is null."); }
means = mean;
}
private:
Config config_;
T* vars;
T* means;
T* vals_hat;
};
#pragma once
#include <cooperative_groups.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <cooperative_groups.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
#pragma once
#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif
#define TILE (128 * 1024 * 1024)
#if defined(__AVX512__) or defined(__AVX256__)
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_ADD(x, y) _mm512_add_ps(x, y)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#define SIMD_LOAD2(x, h) \
((h) ? _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)x)) : _mm512_loadu_ps(x))
#define SIMD_STORE2(x, d, h) \
((h) ? _mm256_storeu_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \
: _mm512_storeu_ps(x, d))
#define INTV __m256i
#elif defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_ADD(x, y) _mm256_add_ps(x, y)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#define SIMD_LOAD2(x, h) \
((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x))
#define SIMD_STORE2(x, d, h) \
((h) ? _mm_storeu_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \
: _mm256_storeu_ps(x, d))
#define INTV __m128i
#endif
union AVX_Data {
#if defined(__AVX512__)
__m512 data;
#elif defined(__AVX256__)
__m256 data;
#endif
// float data_f[16];
};
template <int span>
inline void simd_store(float* dst, AVX_Data* src, bool half_precision)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) {
SIMD_STORE2(dst + SIMD_WIDTH * i, src[i].data, half_precision);
}
}
template <int span>
inline void simd_load(AVX_Data* dst, float* src, bool half_precision)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_LOAD2(src + SIMD_WIDTH * i, half_precision);
}
}
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a[i].data);
}
}
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data src_a)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a.data);
}
}
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_Data* src_a)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r[i].data, src_a[i].data);
}
}
template <int span>
inline void simd_sqrt(AVX_Data* dst, AVX_Data* src)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_SQRT(src[i].data); }
}
template <int span>
inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r.data); }
}
template <int span>
inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r[i].data); }
}
template <int span>
inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r.data); }
}
template <int span>
inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r[i].data); }
}
template <int span>
inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma unroll
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_DIV(src_a_l[i].data, src_a_r[i].data); }
}
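// Editorial sketch (helper name not in the original header): a fused
// a[i] = b[i] * scalar + a[i] over span * SIMD_WIDTH floats, showing how the
// simd_load/simd_fma/simd_store helpers compose. Assumes a and b hold at least
// span * SIMD_WIDTH values (packed halves when half_precision is true).
template <int span>
inline void example_simd_axpy(float* a, float* b, float scalar, bool half_precision)
{
    AVX_Data va[span], vb[span], vs;
    vs.data = SIMD_SET(scalar);
    simd_load<span>(va, a, half_precision);
    simd_load<span>(vb, b, half_precision);
    simd_fma<span>(va, vb, vs, va);  // va = vb * vs + va
    simd_store<span>(a, va, half_precision);
}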
#endif
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
#include <fstream>
using namespace std;
template <typename T>
class Softmax {
public:
struct Config {
size_t batchSize;
size_t heads;
size_t seq_length;
size_t prob_depth;
float temperature;
bool mem_alloc;
Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
: batchSize(batch),
heads(h),
seq_length(seq),
prob_depth(prob_size),
temperature(1.0),
mem_alloc(mem_alloc)
{
}
};
Softmax(Config config) : config_(config) {}
~Softmax() {}
void Forward(int bsz, T* vals, const T* attn_mask, cudaStream_t& stream)
{
launch_attn_softmax<T>(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream);
}
void Backward(int bsz, T* out_grad, const T* soft_out, cudaStream_t stream)
{
launch_attn_softmax_backward_v2<T>(
out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream);
}
inline size_t GetProbDepth() const { return config_.prob_depth; }
inline size_t GetBatchSize() const { return config_.batchSize; }
inline size_t GetNumHeads() const { return config_.heads; }
inline size_t GetSeqLength() const { return config_.seq_length; }
inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }
private:
Config config_;
};
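// Editorial usage sketch (sizes hypothetical, helper name not in the original):
// in-place masked softmax over attention scores for batch=8, heads=16, seq=128;
// vals holds 8 * 16 * 128 * 128 scores.
template <typename T>
inline void example_softmax_forward(T* vals, const T* attn_mask, cudaStream_t stream)
{
    typedef typename Softmax<T>::Config SoftmaxConfig;
    Softmax<T> softmax(SoftmaxConfig(/*batch=*/8, /*heads=*/16, /*seq=*/128));
    softmax.Forward(8, vals, attn_mask, stream);
}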
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
#include <fstream>
using namespace std;
template <typename T>
class Softmax {
public:
struct Config {
size_t batchSize;
size_t heads;
size_t seq_length;
size_t prob_depth;
float temperature;
bool mem_alloc;
Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
: batchSize(batch),
heads(h),
seq_length(seq),
prob_depth(prob_size),
temperature(1.0),
mem_alloc(mem_alloc)
{
}
};
Softmax(Config config) : config_(config) {}
~Softmax() {}
void Forward(int bsz, T* vals, const T* attn_mask, hipStream_t& stream)
{
launch_attn_softmax<T>(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream);
}
void Backward(int bsz, T* out_grad, const T* soft_out, hipStream_t stream)
{
launch_attn_softmax_backward_v2<T>(
out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream);
}
inline size_t GetProbDepth() const { return config_.prob_depth; }
inline size_t GetBatchSize() const { return config_.batchSize; }
inline size_t GetNumHeads() const { return config_.heads; }
inline size_t GetSeqLength() const { return config_.seq_length; }
inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }
private:
Config config_;
};