Commit 7d1a83a9 authored by aiss

Push DeepSpeed 0.6.3 ROCm version

parent ab5534fc
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
return (std::max)(
(std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
// Use at least 1 block, since CUDA does not allow empty block
1);
}
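// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// A minimal example of how DS_GET_BLOCKS and CUDA_1D_KERNEL_LOOP are meant to be paired:
// cap the grid at DS_MAXIMUM_NUM_BLOCKS and let the grid-stride loop cover any remaining
// elements. `ds_scale_kernel` / `ds_scale` are hypothetical names used only for this sketch.
__global__ void ds_scale_kernel(float* data, float factor, int n)
{
    CUDA_1D_KERNEL_LOOP(i, n) { data[i] *= factor; }
}
inline void ds_scale(float* data, float factor, int n, hipStream_t stream)
{
    hipLaunchKernelGGL(ds_scale_kernel,
                       dim3(DS_GET_BLOCKS(n)),
                       dim3(DS_CUDA_NUM_THREADS),
                       0,
                       stream,
                       data,
                       factor,
                       n);
}
// ------------------------------------------------------------------------------------------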
class Context {
public:
Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
{
hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
auto message = std::string("Fail to create cublas handle.");
std::cerr << message << std::endl;
throw std::runtime_error(message);
}
}
virtual ~Context()
{
rocblas_destroy_handle(_cublasHandle);
hipFree(_workspace);
}
static Context& Instance()
{
static Context _ctx;
return _ctx;
}
void SetWorkSpace(void* workspace)
{
if (!workspace) { throw std::runtime_error("Workspace is null."); }
_workspace = workspace;
}
void* GetWorkSpace() { return _workspace; }
hiprandGenerator_t& GetRandGenerator() { return _gen; }
hipStream_t GetCurrentStream()
{
// get current pytorch stream.
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
return stream;
}
hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }
rocblas_handle GetCublasHandle() { return _cublasHandle; }
std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
{
uint64_t offset = _curr_offset;
_curr_offset += offset_inc;
return std::pair<uint64_t, uint64_t>(_seed, offset);
}
void SetSeed(uint64_t new_seed) { _seed = new_seed; }
void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
{
// avoid rerun.
if (_gemm_algos.size() > 0) return;
if (test_gemm) {
rocblas_handle handle = GetCublasHandle();
std::unique_ptr<GemmTest<__half>> test_qkv_fw(
new GemmTest<__half>(batch_size * seq_len, // M
head_num * size_per_head, // N
head_num * size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<GemmTest<__half>> test_inter(
new GemmTest<__half>(batch_size * seq_len, // M
4 * head_num * size_per_head, // N
head_num * size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<GemmTest<__half>> test_output(
new GemmTest<__half>(batch_size * seq_len, // M
head_num * size_per_head, // N
4 * head_num * size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
new StridedGemmTest<__half>(batch_size * head_num, // batch
seq_len, // M
seq_len, // N
size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
new StridedGemmTest<__half>(batch_size * head_num, // batch
size_per_head, // M
seq_len, // N
seq_len, // K
rocblas_operation_none,
rocblas_operation_none,
handle));
_gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
_gemm_algos.push_back(test_inter->TestAlgo(100));
_gemm_algos.push_back(test_output->TestAlgo(100));
_gemm_algos.push_back(test_attn_scores->TestAlgo(100));
_gemm_algos.push_back(test_attn_context->TestAlgo(100));
} else {
// Use default algo.
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
}
}
const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }
private:
hiprandGenerator_t _gen;
rocblas_handle _cublasHandle;
void* _workspace;
uint64_t _seed;
uint64_t _curr_offset;
std::vector<std::array<int, 3>> _gemm_algos;
};
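// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// A minimal sketch of how the Context singleton above is consumed by the kernels and
// optimizers in this commit: one shared rocBLAS handle, the current PyTorch HIP stream,
// and a (seed, offset) pair for RNG state in dropout-style kernels.
inline void example_context_usage()
{
    hipStream_t stream = Context::Instance().GetCurrentStream();
    rocblas_handle blas = Context::Instance().GetCublasHandle();
    auto seed_and_offset = Context::Instance().IncrementOffset(16);  // advance RNG offset by 16
    (void)stream;
    (void)blas;
    (void)seed_and_offset;
}
// ------------------------------------------------------------------------------------------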
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
: _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
{
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adagrad_Optimizer()
{
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step)
{
_step++;
if (_step != step) { _step = step; }
}
inline void update_state(float lr, float epsilon, float weight_decay)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
}
private:
float _alpha;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float* _doubled_buffer[2];
bool _buf_index;
cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
AVX_Data weight_decay4;
if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, grads + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }
simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_add<span>(grad_4, grad_4, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
: _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
{
hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adagrad_Optimizer()
{
hipHostFree(_doubled_buffer[0]);
hipHostFree(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step)
{
_step++;
if (_step != step) { _step = step; }
}
inline void update_state(float lr, float epsilon, float weight_decay)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
}
private:
float _alpha;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float* _doubled_buffer[2];
bool _buf_index;
hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
AVX_Data weight_decay4;
if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, grads + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }
simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_add<span>(grad_4, grad_4, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
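// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// Scalar reference, assuming the same buffer layout, for what one element of the vectorized
// Adagrad Step_AVX loop above computes. As in the SIMD path, weight decay enters the
// accumulated second moment while the numerator uses the raw gradient, and the update
// direction comes from step_size = -alpha.
#include <cmath>
#include <cstddef>
inline void adagrad_step_scalar_reference(float* params,
                                          const float* grads,
                                          float* exp_avg_sq,
                                          size_t param_size,
                                          float alpha,
                                          float eps,
                                          float weight_decay)
{
    for (size_t i = 0; i < param_size; ++i) {
        float grad = grads[i];
        float decayed = (weight_decay > 0 ? grad + params[i] * weight_decay : grad);
        exp_avg_sq[i] += decayed * decayed;  // running sum of squared gradients
        params[i] += -alpha * (grad / (std::sqrt(exp_avg_sq[i]) + eps));  // Adagrad update
    }
}
// ------------------------------------------------------------------------------------------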
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
Adam_Optimizer(float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true)
: _alpha(alpha),
_betta1(betta1),
_betta2(betta2),
_eps(eps),
_weight_decay(weight_decay),
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adam_Optimizer()
{
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step, float beta1, float beta2)
{
if (beta1 != _betta1 || beta2 != _betta2) {
_step = step;
_betta1 = beta1;
_betta2 = beta2;
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
} else {
_step++;
if (_step != step) {
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
_step = step;
} else {
_betta1_t *= _betta1;
_betta2_t *= _betta2;
}
}
}
inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
_bias_correction1 = 1.0f;
_bias_correction2 = 1.0f;
if (bias_correction == 1) {
_bias_correction1 = 1 - _betta1_t;
_bias_correction2 = 1 / sqrt(1 - _betta2_t);
}
}
private:
float _alpha;
float _betta1;
float _betta2;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float _bias_correction1;
float _bias_correction2;
float* _doubled_buffer[2];
bool _buf_index;
bool _adamw_mode;
cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
Adam_Optimizer(float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true)
: _alpha(alpha),
_betta1(betta1),
_betta2(betta2),
_eps(eps),
_weight_decay(weight_decay),
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adam_Optimizer()
{
hipHostFree(_doubled_buffer[0]);
hipHostFree(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step, float beta1, float beta2)
{
if (beta1 != _betta1 || beta2 != _betta2) {
_step = step;
_betta1 = beta1;
_betta2 = beta2;
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
} else {
_step++;
if (_step != step) {
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
_step = step;
} else {
_betta1_t *= _betta1;
_betta2_t *= _betta2;
}
}
}
inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
_bias_correction1 = 1.0f;
_bias_correction2 = 1.0f;
if (bias_correction == 1) {
_bias_correction1 = 1 - _betta1_t;
_bias_correction2 = 1 / sqrt(1 - _betta2_t);
}
}
private:
float _alpha;
float _betta1;
float _betta2;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float _bias_correction1;
float _bias_correction2;
float* _doubled_buffer[2];
bool _buf_index;
bool _adamw_mode;
hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
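// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// Scalar reference, under the same naming as above, for one element of the vectorized
// Adam/AdamW Step_AVX loop: bias_correction1 = 1 - betta1^t, bias_correction2 =
// 1/sqrt(1 - betta2^t), and in AdamW mode decoupled weight decay scales the parameter by
// (1 - alpha * weight_decay).
#include <cmath>
#include <cstddef>
inline void adam_step_scalar_reference(float* params,
                                       const float* grads,
                                       float* exp_avg,
                                       float* exp_avg_sq,
                                       size_t param_size,
                                       float alpha,
                                       float betta1,
                                       float betta2,
                                       float eps,
                                       float weight_decay,
                                       float bias_correction1,
                                       float bias_correction2,
                                       bool adamw_mode)
{
    float step_size = -alpha / bias_correction1;
    for (size_t i = 0; i < param_size; ++i) {
        float grad = grads[i];
        if (weight_decay > 0 && !adamw_mode) grad += params[i] * weight_decay;  // L2-style decay
        exp_avg[i] = betta1 * exp_avg[i] + (1 - betta1) * grad;                 // first moment
        exp_avg_sq[i] = betta2 * exp_avg_sq[i] + (1 - betta2) * grad * grad;    // second moment
        float denom = std::sqrt(exp_avg_sq[i]) * bias_correction2 + eps;
        float update = exp_avg[i] / denom;
        if (weight_decay > 0 && adamw_mode) params[i] += -alpha * weight_decay * params[i];
        params[i] += step_size * update;
    }
}
// ------------------------------------------------------------------------------------------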
@@ -5,7 +5,9 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle,
@@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
@@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
@@ -5,12 +5,29 @@
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define MAX_THREADS 1024
#define THREADS 256
@@ -24,6 +41,34 @@
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
@@ -255,3 +300,4 @@ void launch_fuse_transpose_bias_kernel(const T* inp,
cudaStream_t stream);
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream);
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence length supported, based on the number of threads (2048) allowed in each block;
// this MAX is 8K. For higher sequence lengths we need a higher Max, e.g. for 64K: 32
#define MAX_THREAD_ITERATIONS 8 // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
const T* bias,
T* output,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T>
void launch_gelu(const T* input,
T* output,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T>
void launch_d_gelu(T* d_output,
const T* input,
const T* bias,
int intermediate_size,
int batch_size,
hipStream_t stream);
// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars,
T* means);
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars);
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
const T* vals,
const T* out_grad_trans,
const T* vals_trans,
const T* means,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);
template <typename T>
void launch_attn_softmax_backward(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
const T* attn_mask,
int batch_size,
int heads,
int sequence_length,
hipStream_t stream);
template <typename T>
void launch_transform_0213(T* output,
const T* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream);
// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
const T* vals,
const T* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream,
int trans_count);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
hipStream_t stream,
int trans_count);
template <typename T>
void launch_dropout(T* vals,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
hipStream_t stream);
template <typename T>
void launch_dropout(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
int dim,
float ratio,
hipStream_t stream,
bool bwd = false);
template <typename T>
void launch_dropout(T* out,
const T* vals,
const T* residual,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
hipStream_t stream);
template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream);
template <typename T>
void launch_dropout_grad(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
float ratio,
hipStream_t stream);
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
T* out,
int rows,
int cols,
hipStream_t stream);
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream);
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
uint32_t dim;
bool training;
Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
inline void SetDim(uint32_t d) { dim = d; }
};
Dropout(const Config& config) : _config(config), _mask(nullptr) {}
virtual ~Dropout() {}
void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false)
{
launch_dropout<T>(
out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
}
void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
{
launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void ForwardWithBias(int bsz,
T* out,
const T* vals,
const T* residual,
const T* bias,
cudaStream_t stream)
{
launch_dropout<T>(
out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(
d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
void SetMask(uint8_t* mask)
{
if (!mask) { throw std::runtime_error("Dropout mask is null."); }
_mask = mask;
}
Config GetConfig() const { return _config; }
inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }
private:
uint8_t* _mask;
Config _config;
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
uint32_t dim;
bool training;
Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
inline void SetDim(uint32_t d) { dim = d; }
};
Dropout(const Config& config) : _config(config), _mask(nullptr) {}
virtual ~Dropout() {}
void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
{
launch_dropout<T>(
out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
}
void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
{
launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void ForwardWithBias(int bsz,
T* out,
const T* vals,
const T* residual,
const T* bias,
hipStream_t stream)
{
launch_dropout<T>(
out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals, hipStream_t stream)
{
launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
{
launch_dropout_grad<T>(
d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
void SetMask(uint8_t* mask)
{
if (!mask) { throw std::runtime_error("Dropout mask is null."); }
_mask = mask;
}
Config GetConfig() const { return _config; }
inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }
private:
uint8_t* _mask;
Config _config;
};
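// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// A minimal sketch, assuming `acts_in`, `acts_out`, `grads` and a bsz*dim byte `mask` buffer
// already live on the device, of the forward/backward pairing of the Dropout wrapper above.
inline void example_dropout_usage(int bsz,
                                  uint32_t dim,
                                  const float* acts_in,
                                  float* acts_out,
                                  float* grads,
                                  uint8_t* mask,
                                  hipStream_t stream)
{
    Dropout<float> drop(Dropout<float>::Config(0.1f, dim));  // 10% dropout ratio
    drop.SetMask(mask);
    drop.SetTrainingMode(true);
    drop.Forward(bsz, acts_out, acts_in, stream);  // records the mask and scales kept activations
    drop.Backward(bsz, grads, stream);             // reuses the recorded mask in-place on grads
}
// ------------------------------------------------------------------------------------------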
@@ -34,12 +34,12 @@ struct BertGemmAlgos {
template <typename T>
class BertTransformerLayer {
public:
BertTransformerLayer(unsigned layer_id,
unsigned batch_size,
unsigned hidden_size,
unsigned num_heads,
unsigned intermediate_size,
unsigned seq_length,
float attn_dropout_ratio,
float hidden_output_dropout_ratio,
float layer_norm_eps,
@@ -52,7 +52,7 @@ public:
virtual ~BertTransformerLayer();
void Forward(unsigned bsz,
const T* input_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
@@ -80,7 +80,7 @@ public:
T* gelu_inp_ptr,
T* ff2_inp_ptr);
void Backward(unsigned bsz,
const T* grad_output_ptr,
const T* input_ptr,
const T* output_ptr,
@@ -128,13 +128,13 @@ public:
T* attn_layer_norm_var,
T* attn_layer_norm_mean);
inline unsigned GetBatchSize() const { return _batch_size; }
inline unsigned GetNumHeads() const { return _heads; }
inline unsigned GetSeqLength() const { return _seq_length; }
inline unsigned GetIntermediateSize() const { return _intermediate_size; }
void SetSeqLength(unsigned seq_len);
inline unsigned GetHiddenSize() const { return _hidden_size; }
void SetTrainingMode(bool training);
inline bool IsTrainingMode() const { return _training; }
inline bool GeluCheckpoint() const { return _gelu_checkpoint; }
@@ -144,13 +144,13 @@ private:
size_t getWorkspaceSize(int maxBatchSize) const;
// Params
unsigned _layer_id;
unsigned _batch_size;
unsigned _hidden_size;
unsigned _heads;
unsigned _size_per_head;
unsigned _intermediate_size;
unsigned _seq_length;
bool _pre_or_postLayerNorm;
@@ -179,6 +179,6 @@ private:
bool _normalize_invertible;
bool _gelu_checkpoint;
// High Performance flags
bool _stochastic_mode;
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime_api.h>
#include <hiprand/hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "dropout_hip.h"
#include "feed_forward_hip.h"
#include "gelu_hip.h"
#include "general_kernels_hip.h"
#include "normalize_layer_hip.h"
#include "softmax_hip.h"
#include "strided_batch_gemm_hip.h"
struct BertGemmAlgos {
int m_gemm_qkv_algo;
int m_gemm_inter_algo;
int m_gemm_output_algo;
int m_gemm_batch1_algo;
int m_gemm_batch2_algo;
BertGemmAlgos()
: m_gemm_qkv_algo(-1),
m_gemm_inter_algo(-1),
m_gemm_output_algo(-1),
m_gemm_batch1_algo(-1),
m_gemm_batch2_algo(-1)
{
}
};
template <typename T>
class BertTransformerLayer {
public:
BertTransformerLayer(unsigned layer_id,
unsigned batch_size,
unsigned hidden_size,
unsigned num_heads,
unsigned intermediate_size,
unsigned seq_length,
float attn_dropout_ratio,
float hidden_output_dropout_ratio,
float layer_norm_eps,
bool pre_or_postLayerNorm,
const std::vector<std::array<int, 3>>& gemm_algos,
bool attn_dropout_checkpoint,
bool normalize_invertible,
bool gelu_checkpoint,
bool stochastic_mode);
virtual ~BertTransformerLayer();
void Forward(unsigned bsz,
const T* input_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_qkvb_ptr,
const T* attn_ow_ptr,
const T* attn_ob_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* output_b_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* out_ptr,
T* inp_norm_ptr,
T* q_tf_ptr,
T* k_tf_ptr,
T* v_tf_ptr,
T* softmax_output_ptr,
T* ctx_bufB_ptr,
T* attn_o_inp_ptr,
T* add_res_ptr,
T* ff1_inp_ptr,
T* gelu_inp_ptr,
T* ff2_inp_ptr);
void Backward(unsigned bsz,
const T* grad_output_ptr,
const T* input_ptr,
const T* output_ptr,
const T* inp_norm_ptr,
const T* q_tf_ptr,
const T* k_tf_ptr,
const T* v_tf_ptr,
const T* softmax_output_ptr,
const T* ctx_bufB_ptr,
const T* attn_o_inp_ptr,
const T* add_res_ptr,
const T* ff1_inp_ptr,
const T* gelu_inp_ptr,
const T* ff2_inp_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_ow_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* grad_input_ptr,
T* grad_attn_qkvw_ptr,
T* grad_attn_qkvb_ptr,
T* grad_attn_ow_ptr,
T* grad_attn_ob_ptr,
T* grad_attn_nw_ptr,
T* grad_attn_nb_ptr,
T* grad_inter_w_ptr,
T* grad_inter_b_ptr,
T* grad_output_w_ptr,
T* grad_output_b_ptr,
T* grad_norm_w_ptr,
T* grad_norm_b_ptr);
void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
uint8_t* attn_output_dropout_mask_ptr,
uint8_t* layer_output_dropout_mask_ptr,
T* layer_norm_var,
T* layer_norm_mean,
T* attn_layer_norm_var,
T* attn_layer_norm_mean);
inline unsigned GetBatchSize() const { return _batch_size; }
inline unsigned GetNumHeads() const { return _heads; }
inline unsigned GetSeqLength() const { return _seq_length; }
inline unsigned GetIntermediateSize() const { return _intermediate_size; }
void SetSeqLength(unsigned seq_len);
inline unsigned GetHiddenSize() const { return _hidden_size; }
void SetTrainingMode(bool training);
inline bool IsTrainingMode() const { return _training; }
inline bool GeluCheckpoint() const { return _gelu_checkpoint; }
private:
void Initialize();
size_t getWorkspaceSize(int maxBatchSize) const;
// Params
unsigned _layer_id;
unsigned _batch_size;
unsigned _hidden_size;
unsigned _heads;
unsigned _size_per_head;
unsigned _intermediate_size;
unsigned _seq_length;
bool _pre_or_postLayerNorm;
rocblas_handle _cublasHandle;
hipStream_t _stream;
// layers
FeedForward<T> _qkv_linear;
FeedForward<T> _attn_out_linear;
Normalize_Layer<T> _attn_layer_norm;
Normalize_Layer<T> _layer_norm;
Normalize_Layer<T>* _last_normalize;
FeedForward<T> _ff1, _ff2;
Softmax<T> _softmax;
Gelu<T> _gelu;
Dropout<T> _attn_prob_dropout;
Dropout<T> _attn_output_dropout;
Dropout<T> _layer_output_dropout;
StridedBatchGemm<T> _attn_scores;
StridedBatchGemm<T> _attn_context;
bool _training;
// Memory saving flags
bool _attn_dropout_checkpoint;
bool _normalize_invertible;
bool _gelu_checkpoint;
// High Performance flags
bool _stochastic_mode;
};
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class FeedForward {
public:
struct Config {
int batchSize, outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
: batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
{
}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz,
const T* input_ptr,
const T* weights,
T* out,
cublasHandle_t& _cublasHandle)
{
float alpha = T(1.);
float beta = T(0.);
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_T,
CUBLAS_OP_N,
config_.outputSize,
bsz,
config_.inputSize,
&alpha,
&beta,
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
const T* input_ptr,
const T* weights,
T* weights_grad,
T* bias_grad,
cublasHandle_t& _cublasHandle,
cudaStream_t& stream,
T* inp_grad_out = nullptr,
T* out_grad_trans_out = nullptr)
{
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_T,
config_.inputSize,
config_.outputSize,
bsz,
&alpha,
&beta,
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
config_.inputSize,
bsz,
config_.outputSize,
&alpha,
&beta,
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
private:
Config config_;
};
#endif
// !!! This is a file automatically generated by hipify!!!
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class FeedForward {
public:
struct Config {
int batchSize, outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
: batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
{
}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz,
const T* input_ptr,
const T* weights,
T* out,
rocblas_handle& _cublasHandle)
{
float alpha = T(1.);
float beta = T(0.);
cublas_gemm_ex(_cublasHandle,
rocblas_operation_transpose,
rocblas_operation_none,
config_.outputSize,
bsz,
config_.inputSize,
&alpha,
&beta,
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
const T* input_ptr,
const T* weights,
T* weights_grad,
T* bias_grad,
rocblas_handle& _cublasHandle,
hipStream_t& stream,
T* inp_grad_out = nullptr,
T* out_grad_trans_out = nullptr)
{
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle,
rocblas_operation_none,
rocblas_operation_transpose,
config_.inputSize,
config_.outputSize,
bsz,
&alpha,
&beta,
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif
cublas_gemm_ex(_cublasHandle,
rocblas_operation_none,
rocblas_operation_none,
config_.inputSize,
bsz,
config_.outputSize,
&alpha,
&beta,
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
private:
Config config_;
};
#endif
#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"

template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz,
                            const T* input_buf,
                            const T* bias,
                            T* output,
                            cudaStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
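// --- Editor's illustration (not part of the committed header) ---
// Hedged sketch of the Gelu wrapper above: ForwardWithBiasAdd launches a fused
// bias-add + GELU kernel over bsz rows of intermediate_size elements on the
// given stream. Buffer names, the intermediate size, and the helper name are
// hypothetical placeholders.
inline void example_gelu_fp32(const float* d_ff1_out, // [bsz x inter_size], device memory
                              const float* d_bias,    // [inter_size], device memory
                              float* d_act_out,       // [bsz x inter_size], device memory
                              int bsz,
                              cudaStream_t stream)
{
    Gelu<float>::Config cfg(/*intermediate_size*/ 4096);
    Gelu<float> gelu(cfg);
    gelu.ForwardWithBiasAdd(bsz, d_ff1_out, d_bias, d_act_out, stream);
}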
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class Gelu {
public:
struct Config {
uint32_t intermediate_size;
Config(uint32_t inter_size) : intermediate_size(inter_size) {}
};
Gelu(const Config& config) : _config(config) {}
virtual ~Gelu() {}
void ForwardWithBiasAdd(int bsz,
const T* input_buf,
const T* bias,
T* output,
hipStream_t stream)
{
launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
}
void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
{
launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
}
private:
Config _config;
};
#pragma once

#include <cuda_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"

template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + +file + ":" + std::to_string(line) +
                      " \n");
    }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_T,
                           CUBLAS_OP_N,
                           N,
                           M,
                           K,
                           &alpha,
                           &beta,
                           B,
                           A,
                           C,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N,
                           CUBLAS_OP_T,
                           K,
                           N,
                           M,
                           &alpha,
                           &beta,
                           A,
                           C,
                           B,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N,
                           CUBLAS_OP_N,
                           K,
                           M,
                           N,
                           &alpha,
                           &beta,
                           B,
                           C,
                           A,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};

template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b,
                    int m,
                    int n,
                    int k,
                    cublasOperation_t ta,
                    cublasOperation_t tb,
                    cublasHandle_t h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle,
                                        M,
                                        N,
                                        K,
                                        &alpha,
                                        &beta,
                                        A,
                                        B,
                                        C,
                                        transa,
                                        transb,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == CUBLAS_OP_T ? K : M);
            int kb = (transa == CUBLAS_OP_T ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B need to transpose.
            cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb,
                                        kb,
                                        N,
                                        &alpha,
                                        &beta,
                                        (transa == CUBLAS_OP_T ? B : C),
                                        (transa == CUBLAS_OP_T ? C : B),
                                        A,
                                        CUBLAS_OP_N,
                                        op_b,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A need to transpose.
            cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K,
                                        N,
                                        M,
                                        &alpha,
                                        &beta,
                                        A,
                                        C,
                                        B,
                                        op_a,
                                        CUBLAS_OP_N,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};
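// --- Editor's illustration (not part of the committed header) ---
// Hedged CUDA-side sketch: GemmTest<T>::TestAlgo times every candidate GEMM
// algorithm for the forward shape and the two backward shapes and returns the
// three fastest indices. Those indices are what the transformer layers store in
// Config::gemm_algos (Context::TestGemmFP16 earlier in this commit drives the
// same class). The problem sizes and the helper name here are hypothetical.
inline std::array<int, 3> example_tune_qkv_gemm(cublasHandle_t handle)
{
    const int batch_tokens = 8 * 128;  // batch_size * seq_len (example values)
    const int hidden = 1024;           // head_num * size_per_head (example value)
    GemmTest<__half> test(batch_tokens, hidden, hidden, CUBLAS_OP_T, CUBLAS_OP_N, handle);
    return test.TestAlgo(/*loops*/ 100);
}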
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers_hip.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result) {
std::cout << (std::string("CUDA runtime error: ") + +file + ":" + std::to_string(line) +
" \n");
}
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
: M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
}
~GemmTest()
{
check_cuda_error(hipFree(A));
check_cuda_error(hipFree(B));
check_cuda_error(hipFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = (T)1.0f;
float beta = (T)0.0f;
int algo_fw = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_transpose,
rocblas_operation_none,
N,
M,
K,
&alpha,
&beta,
B,
A,
C,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_none,
rocblas_operation_transpose,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_none,
rocblas_operation_none,
K,
M,
N,
&alpha,
&beta,
B,
C,
A,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
hipDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
hipDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int M, N, K;
rocblas_handle handle;
rocblas_operation transa, transb;
T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
StridedGemmTest(int b,
int m,
int n,
int k,
rocblas_operation ta,
rocblas_operation tb,
rocblas_handle h)
: bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
}
~StridedGemmTest()
{
check_cuda_error(hipFree(A));
check_cuda_error(hipFree(B));
check_cuda_error(hipFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = (T)1.0f;
float beta = (T)0.0f;
int algo_fw = Run(loops, [=](int algo) {
int stride_a = M * K;
int stride_b = N * K;
int stride_c = M * N;
cublas_strided_batched_gemm(handle,
M,
N,
K,
&alpha,
&beta,
A,
B,
C,
transa,
transb,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
int mb = (transa == rocblas_operation_transpose ? K : M);
int kb = (transa == rocblas_operation_transpose ? M : K);
int stride_a = mb * N;
int stride_b = N * kb;
int stride_c = M * K;
// B need to transpose.
rocblas_operation op_b = (transb == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
N,
&alpha,
&beta,
(transa == rocblas_operation_transpose ? B : C),
(transa == rocblas_operation_transpose ? C : B),
A,
rocblas_operation_none,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
// A need to transpose.
rocblas_operation op_a = (transa == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose);
int stride_a = M * K;
int stride_b = M * N;
int stride_c = N * K;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
op_a,
rocblas_operation_none,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
hipDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
hipDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int bsz, M, N, K;
rocblas_handle handle;
rocblas_operation transa, transb;
T *A, *B, *C;
};
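// --- Editor's note (not part of the committed header) ---
// On the ROCm path the Run() loop above iterates only rocblas_gemm_algo_standard,
// so TestAlgo() effectively times the default algorithm rather than choosing
// among the 16 tensor-op algorithms swept on CUDA. A hedged HIP-side usage
// sketch, with hypothetical sizes and helper name:
inline std::array<int, 3> example_tune_strided_gemm_hip(rocblas_handle handle)
{
    StridedGemmTest<__half> test(/*bsz*/ 16,
                                 /*m*/ 128,
                                 /*n*/ 128,
                                 /*k*/ 64,
                                 rocblas_operation_transpose,
                                 rocblas_operation_none,
                                 handle);
    return test.TestAlgo(/*loops*/ 100);
}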