Unverified Commit 7cfed5f0 authored by Frank Lee's avatar Frank Lee Committed by GitHub
Browse files

[feat] refactored extension module (#5298)

* [feat] refactored extension module

* polish

* polish

* polish

* polish

* polish

* polish

* polish

* polish

* polish

* polish
parent d7f8db8e
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include "q4_matrix.cuh"
#include <vector>
#include "util.cuh"
#include "matrix.cuh"
using namespace std;
const int UNSHUF_BLOCKSIZE_X = 64;
const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column
const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows
vector<Q4Matrix*> g_q4_matrices;
void g_q4_keep_matrix(Q4Matrix* m)
{
g_q4_matrices.push_back(m);
}
void g_q4_free_matrices()
{
for (const auto& m : g_q4_matrices) delete m;
g_q4_matrices.clear();
}
Q4Matrix::Q4Matrix
(
const int _height,
const int _width,
const int _groups,
uint32_t* _qweight,
uint32_t* _qzeros,
half* _scales,
uint32_t* _g_idx,
const int _device
) :
height(_height),
width(_width),
groups(_groups),
device(_device)
{
cudaSetDevice(device);
cuda_qweight = _qweight;
cuda_qzeros = _qzeros;
cuda_scales = _scales;
groupsize = height / groups;
if (_g_idx) make_sequential(_g_idx);
}
Q4Matrix::~Q4Matrix()
{
}
// Make sequential
__global__ void make_sequential_kernel
(
const uint32_t* __restrict__ w,
uint32_t* __restrict__ w_new,
const uint32_t* __restrict__ x_map,
const int w_height,
const int w_width
)
{
const uint64_t* w2 = (uint64_t*) w;
uint64_t* w_new2 = (uint64_t*) w_new;
int w2_stride = w_width >> 1;
int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
if (w2_column >= w2_stride) return;
int w_new2_row = blockIdx.y;
int x_map_idx = w_new2_row << 3;
uint64_t dst = 0;
#pragma unroll
for (int i = 0; i < 8; i++)
{
int source_row = x_map[x_map_idx++];
int w2_row = source_row >> 3;
int w2_subrow = source_row & 0x07;
int w2_row_shift = w2_subrow << 2;
int wnew2_row_shift = i << 2;
uint64_t src = w2[w2_row * w2_stride + w2_column];
src >>= w2_row_shift;
src &= 0x0000000f0000000f;
src <<= wnew2_row_shift;
dst |= src;
}
w_new2[w_new2_row * w2_stride + w2_column] = dst;
}
void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
{
uint32_t* cuda_new_qweight = NULL;
cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: Should probably be allocated in PyTorch
uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
// Group histogram
for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
// Group map
for (int i = 0, acc = 0; i < groups; i++)
{
short tmp = cpu_g_idx_map[i];
cpu_g_idx_map[i] = acc;
acc += tmp;
}
// X map (inverse)
for (int row = 0; row < height; row++)
{
uint32_t target_group = cpu_g_idx[row];
uint32_t target_row = cpu_g_idx_map[target_group];
cpu_g_idx_map[target_group]++;
cpu_x_map_inv[row] = target_row;
}
// X map
for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
// Move to CUDA
cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);
// Rearrange rows in w
dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
dim3 blocks
(
(width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2),
height / 8,
1
);
make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
// Replace qweights
cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
// Cleanup
cudaDeviceSynchronize();
cudaFree(cuda_new_qweight);
free(cpu_g_idx_map);
free(cpu_x_map);
free(cpu_x_map_inv);
}
__global__ void reconstruct_kernel
(
const uint32_t* __restrict__ w,
half* __restrict__ out, // (y)
const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros,
const int height,
const int width,
const int groupsize
)
{
// Start of block
int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
if (column >= width) return;
// Views
MatrixView_q4_column w_(w, height, width);
MatrixView_half_rw out_(out, height, width);
MatrixView_half w_scales_(w_scales, height / groupsize, width);
MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);
// Groupsize version
int group = row / groupsize;
half w_scale = w_scales_.item(group, column);
uint32_t w_zero = w_zeros_.item(group, column) + 1;
uint32_t w_read = w_.item_uint32_t(row, column);
half* out_ptr = out_.item_ptr(row, column);
#pragma unroll
for (int s = 0; s < 32; s += 4)
{
half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
*out_ptr = w_item; out_ptr += out_.width;
}
}
void Q4Matrix::reconstruct(half* out)
{
dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);
dim3 blocks
(
(width + threads.x - 1) / threads.x,
(height / 8 + threads.y - 1) / threads.y,
1
);
reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
}
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
class Q4Matrix
{
public:
int device;
int height;
int width;
int groups;
int groupsize;
uint32_t* cuda_qweight = NULL;
uint32_t* cuda_qzeros = NULL;
half* cuda_scales = NULL;
uint32_t* cuda_x_map = NULL;
Q4Matrix
(
const int _height,
const int _width,
const int _groups,
uint32_t* _qweight,
uint32_t* _qzeros,
half* _scales,
uint32_t* _g_idx,
const int _device
);
~Q4Matrix();
void reconstruct(half* out);
private:
void make_sequential(const uint32_t* cpu_g_idx);
};
void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();
#endif
\ No newline at end of file
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _tuning_h
#define _tuning_h
struct ExLlamaTuning {
int matmul_recons_thd;
bool matmul_fused_remap;
bool matmul_no_half2;
};
#endif
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _util_cuh
#define _util_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown
#else
#define cudaUnspecified cudaErrorApiFailureBase
#endif
// React to failure on return code != cudaSuccess
#define _cuda_check(fn) \
do { \
{_cuda_err = fn;} \
if (_cuda_err != cudaSuccess) goto _cuda_fail; \
} while(false)
// React to failure on return code == 0
#define _alloc_check(fn) \
do { \
if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \
else _cuda_err = cudaSuccess; \
} while(false)
#endif
#include "block_reduce.h"
#include "cuda_util.h"
#include "kernels.h"
#include "ls_cub.cuh"
ls::cub::CachingDeviceAllocator g_allocator(true);
template <typename T>
__global__ void ls_cross_entropy_fw_kernel(
const T *__restrict__ inputs, const int *__restrict__ targets,
float *__restrict__ outputs, float *__restrict__ nll_loss_outputs,
const int padding_idx, const float epsilon, const int vocab_size) {
/* step1: compute each thread's max_logit and sum_exp_logit, store in
* max_input, sum_exp_logit */
const int block_start = blockIdx.x * vocab_size;
const int left_idx = block_start + threadIdx.x;
const int right_idx = (blockIdx.x + 1) * vocab_size;
float max_input[1] = {REDUCE_FLOAT_INF_NEG};
float sum_logits[2] = {0.f, 0.f}; // logit and logit exp
int target_tid = targets[blockIdx.x];
if (target_tid == padding_idx) {
if (threadIdx.x == 0) {
nll_loss_outputs[blockIdx.x] = 0.f;
outputs[blockIdx.x] = 0.f;
}
return;
}
for (int i = left_idx; i < right_idx; i += blockDim.x) {
max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
}
blockReduce<ReduceType::kMax, 1>(max_input);
__shared__ float s_max_input;
if (threadIdx.x == 0) {
s_max_input = max_input[0];
}
__syncthreads();
for (int i = left_idx; i < right_idx; i += blockDim.x) {
float logit = static_cast<float>(inputs[i]) - s_max_input;
sum_logits[0] += logit;
sum_logits[1] += expf(logit);
}
blockReduce<ReduceType::kSum, 2>(sum_logits);
__shared__ float s_sum_logit;
__shared__ float s_sum_exp;
if (threadIdx.x == 0) {
s_sum_logit = sum_logits[0];
s_sum_exp = sum_logits[1];
}
__syncthreads();
float eps_i = epsilon / (vocab_size - 1);
if (threadIdx.x == 0) {
// neg_log_prob = log(sum(exp(x - x_max))) - (x - x_max)
float nll_loss = logf(s_sum_exp) -
static_cast<float>(inputs[block_start + target_tid]) +
s_max_input;
nll_loss_outputs[blockIdx.x] = nll_loss;
float sum_nll_loss = vocab_size * logf(s_sum_exp) - s_sum_logit;
outputs[blockIdx.x] =
(1.f - epsilon - eps_i) * nll_loss + eps_i * sum_nll_loss;
}
}
template <typename T>
__global__ void ls_cross_entropy_bw_kernel(
const float *__restrict__ grad_outputs, const T *__restrict__ inputs,
const int *__restrict__ targets, T *__restrict__ grad_inputs,
const int padding_idx, const float epsilon, const int vocab_size) {
/* step1: compute each thread's max_logit and sum_exp_logit, store in
* max_input, sum_exp_logit */
const int block_start = blockIdx.x * vocab_size;
const int left_idx = block_start + threadIdx.x;
const int right_idx = (blockIdx.x + 1) * vocab_size;
float max_input[1] = {REDUCE_FLOAT_INF_NEG};
float sum_logits[1] = {0.f};
const float grad_out = static_cast<float>(grad_outputs[0]);
int target_tid = targets[blockIdx.x];
if (target_tid == padding_idx) {
for (int i = left_idx; i < right_idx; i += blockDim.x) {
grad_inputs[i] = 0.f;
}
return;
}
for (int i = left_idx; i < right_idx; i += blockDim.x) {
max_input[0] = fmaxf(max_input[0], static_cast<float>(inputs[i]));
}
blockReduce<ReduceType::kMax, 1>(max_input);
__shared__ float s_max_input;
if (threadIdx.x == 0) {
s_max_input = max_input[0];
}
__syncthreads();
for (int i = left_idx; i < right_idx; i += blockDim.x) {
float logit = static_cast<float>(inputs[i]) - s_max_input;
sum_logits[0] += expf(logit);
}
blockReduce<ReduceType::kSum, 1>(sum_logits);
__shared__ float s_sum_exp;
if (threadIdx.x == 0) {
s_sum_exp = sum_logits[0];
}
__syncthreads();
float eps_i = epsilon / (vocab_size - 1);
float nll_weight = 1.0 - epsilon - eps_i;
for (int i = left_idx; i < right_idx; i += blockDim.x) {
float prob = expf(static_cast<float>(inputs[i]) - s_max_input) / s_sum_exp;
float grad = 0;
grad += (vocab_size * prob - 1) * eps_i;
grad += prob * nll_weight;
if ((i - block_start) == target_tid) {
grad -= nll_weight;
}
grad_inputs[i] = grad_out * grad;
}
}
template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
float *outputs_ptr, float *nll_loss_ptr,
float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size,
const int seq_len, const int vocab_size,
cudaStream_t stream) {
int grid_dim = batch_size * seq_len;
float *nll_loss_buffer = loss_buffer + grid_dim;
ls_cross_entropy_fw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
inputs_ptr, targets_ptr, loss_buffer, nll_loss_buffer, padding_idx,
epsilon, vocab_size);
int num_items = grid_dim;
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
loss_buffer, outputs_ptr,
num_items, stream));
CHECK_GPU_ERROR(
g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
loss_buffer, outputs_ptr,
num_items, stream));
CHECK_GPU_ERROR(ls::cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
nll_loss_buffer, nll_loss_ptr,
num_items, stream));
CHECK_GPU_ERROR(g_allocator.DeviceFree(d_temp_storage));
}
template void launch_cross_entropy_fw<float>(
const float *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template void launch_cross_entropy_fw<__half>(
const __half *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
float *nll_loss_ptr, float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr, const T *inputs_ptr,
const int *targets_ptr, T *grad_inputs_ptr,
const int padding_idx, const float epsilon,
const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream) {
int grid_dim = batch_size * seq_len;
ls_cross_entropy_bw_kernel<<<grid_dim, MAX_THREADS, 0, stream>>>(
grad_outputs_ptr, inputs_ptr, targets_ptr, grad_inputs_ptr, padding_idx,
epsilon, vocab_size);
}
template void launch_cross_entropy_bw<float>(
const float *grad_outputs_ptr, const float *inputs_ptr,
const int *targets_ptr, float *grad_inputs_ptr, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template void launch_cross_entropy_bw<__half>(
const float *grad_outputs_ptr, const __half *inputs_ptr,
const int *targets_ptr, __half *grad_inputs_ptr, const int padding_idx,
const float epsilon, const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include "cublas_wrappers.h"
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const float *A,
const float *B, float *C, cublasGemmAlgo_t algo) {
cublasStatus_t status =
cublasGemmEx(handle, transa, transb, m, n, k, (const void *)alpha,
(const void *)A, CUDA_R_32F, (transa == CUBLAS_OP_N) ? m : k,
(const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
(const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const __half *A,
const __half *B, __half *C, cublasGemmAlgo_t algo) {
cublasStatus_t status = cublasGemmEx(
handle, transa, transb, m, n, k, (const void *)alpha, (const void *)A,
CUDA_R_16F, (transa == CUBLAS_OP_N) ? m : k, (const void *)B, CUDA_R_16F,
(transb == CUBLAS_OP_N) ? k : n, (const void *)beta, (void *)C,
CUDA_R_16F, m, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
const float *alpha, const float *beta,
const float *A, const float *B, float *C,
cublasOperation_t op_A, cublasOperation_t op_B,
int stride_A, int stride_B, int stride_C,
int batch, cublasGemmAlgo_t algo) {
cublasStatus_t status = cublasGemmStridedBatchedEx(
handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_32F,
(op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_32F,
(op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_32F, m, stride_C,
batch, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, "
"error: %d) \n",
batch, m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
const float *alpha, const float *beta,
const __half *A, const __half *B, __half *C,
cublasOperation_t op_A, cublasOperation_t op_B,
int stride_A, int stride_B, int stride_C,
int batch, cublasGemmAlgo_t algo) {
cublasStatus_t status = cublasGemmStridedBatchedEx(
handle, op_A, op_B, m, n, k, alpha, A, CUDA_R_16F,
(op_A == CUBLAS_OP_N) ? m : k, stride_A, B, CUDA_R_16F,
(op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, CUDA_R_16F, m, stride_C,
batch, CUDA_R_32F, algo);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m, n, k, (int)status);
return EXIT_FAILURE;
}
return 0;
}
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include "cuda_util.h"
/* GPU function guard */
std::string _cudaGetErrorString(cudaError_t error) {
return cudaGetErrorString(error);
}
std::string _cudaGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "CUBLAS_UNKNOW";
}
template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
int const line) {
if (result) {
throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" +
std::to_string(line) +
"): " + (_cudaGetErrorString(result)) + "\n");
}
}
template void check_gpu_error<cudaError_t>(cudaError_t result,
char const *const func,
const char *const file,
int const line);
template void check_gpu_error<cublasStatus_t>(cublasStatus_t result,
char const *const func,
const char *const file,
int const line);
template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
std::cout << outn << ": ";
std::vector<T> hout(num_output_ele, (T)0);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T),
cudaMemcpyDeviceToHost);
for (int i = 0; i < num_output_ele; i++) {
std::cout << hout[i] << ", ";
}
std::cout << std::endl;
}
template <>
void print_vec<__half>(const __half *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::vector<__half> hout(num_output_ele, (__half)0.f);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half),
cudaMemcpyDeviceToHost);
for (int i = 0; i < num_output_ele; i++) {
std::cout << __half2float(hout[i]) << ", ";
}
std::cout << std::endl;
}
template void print_vec<float>(const float *outv, std::string outn,
int num_output_ele);
template void print_vec<int>(const int *outv, std::string outn,
int num_output_ele);
template void print_vec<__half>(const __half *outv, std::string outn,
int num_output_ele);
template <typename T>
T *cuda_malloc(size_t ele_num) {
size_t byte_size = ele_num * sizeof(T);
T *pdata = nullptr;
CHECK_GPU_ERROR(cudaMalloc((void **)&pdata, byte_size));
return pdata;
}
template float *cuda_malloc<float>(size_t ele_num);
template __half *cuda_malloc<__half>(size_t ele_num);
template uint8_t *cuda_malloc<uint8_t>(size_t ele_num);
void cuda_free(void *pdata) {
if (pdata != nullptr) {
cudaFree(pdata);
}
}
template <typename T>
struct _isnan {
__device__ bool operator()(T a) const { return isnan(a); }
};
template <>
struct _isnan<__half> {
__device__ bool operator()(const __half a) const { return __hisnan(a); }
};
template <typename T>
struct _isinf {
__device__ bool operator()(T a) const { return isinf(a); }
};
template <>
struct _isinf<__half> {
__device__ bool operator()(const __half a) const { return __hisinf(a); }
};
template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
std::string file, int line, cudaStream_t stream) {
// check_nan_inf = 0 for checking nan
// check_nan_inf = 1 for checking inf
bool res = false;
std::string msg = file + "(" + std::to_string(line) + "): ";
if (check_nan_inf) {
msg += "nan.";
res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
data_ptr + dsize, _isnan<T>(), false,
thrust::logical_or<bool>());
} else {
msg += "inf.";
res = thrust::transform_reduce(thrust::cuda::par.on(stream), data_ptr,
data_ptr + dsize, _isinf<T>(), false,
thrust::logical_or<bool>());
}
if (res) {
throw std::runtime_error(msg);
}
std::cout << msg << " [check pass]." << std::endl;
}
template void check_nan_inf<float>(const float *data_ptr, int dsize,
bool check_nan_inf, std::string file,
int line, cudaStream_t stream);
template void check_nan_inf<__half>(const __half *data_ptr, int dsize,
bool check_nan_inf, std::string file,
int line, cudaStream_t stream);
#include <cooperative_groups.h>
#include "kernels.h"
namespace cg = cooperative_groups;
/**
@brief: fuse_transpose_bias
Calculate the sum of elements in each column of the matrix.
@thread
gridDim.x = ceil(cols / WARP_SIZE)
blockDim.x = WARP_SIZE
blockDim.y = WARP_SIZE
@param
inp: [rows, cols]
out: [cols]
rows: the number of rows in the matrix
cols: the number of cols in the matrix
*/
template <typename T>
__global__ void column_sum_reduce(const T *__restrict__ inp,
T *__restrict__ out, int rows, int cols) {
__shared__ float tile[WARP_SIZE][WARP_SIZE];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
int y_stride = cols * WARP_SIZE;
float localSum = 0;
// Loop across matrix row
// TODO: optimize to log complexity
if (idx < cols) {
int offset = flat_2dim(threadIdx.y, idx, cols);
for (int r = threadIdx.y; r < rows; r += WARP_SIZE) {
localSum += (float)inp[offset];
offset += y_stride;
}
}
// The sum of a row in tile is equal to the sum of a col in original matrix
tile[threadIdx.x][threadIdx.y] = localSum;
__syncthreads();
// Sum the shared buffer.
// The change of threadIdx.x is continuous
float sum = tile[threadIdx.y][threadIdx.x];
__syncthreads();
// Calculate the sum of a row in tile
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
if (threadIdx.x == 0) {
int pos = flat_2dim(blockIdx.x, threadIdx.y, WARP_SIZE);
if (pos < cols) out[pos] = sum;
}
}
// [r, c] -> [c]
template <>
void launch_fuse_transpose_bias_kernel<float>(const float *inp, float *out,
int rows, int cols,
cudaStream_t stream) {
dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
dim3 block_dim(WARP_SIZE, WARP_SIZE);
column_sum_reduce<float>
<<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
template <>
void launch_fuse_transpose_bias_kernel<__half>(const __half *inp, __half *out,
int rows, int cols,
cudaStream_t stream) {
dim3 grid_dim((cols - 1) / WARP_SIZE + 1);
dim3 block_dim(WARP_SIZE, WARP_SIZE);
column_sum_reduce<__half>
<<<grid_dim, block_dim, 0, stream>>>(inp, out, rows, cols);
}
/**
@brief: fused_add2
Add two matrix inp1 and inp2 to out.
@thread
gridDim.x = batch_size * seq_len
blockDim.x = min(hidden_dim, MAX_THREADS)
@param
inp1: [batch_size, seq_len, hidden_dim]
inp2: [batch_size, seq_len, hidden_dim]
out: [batch_size, seq_len, hidden_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
*/
template <typename T>
__global__ void fused_add2_kernel(T *out, const T *inp1, const T *inp2,
int hidden_dim);
template <>
__global__ void fused_add2_kernel<float>(float *out, const float *inp1,
const float *inp2, int hidden_dim) {
int row_id = blockIdx.x;
int offset = flat_2dim(row_id, 0, hidden_dim);
const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
float4 *out_4 = reinterpret_cast<float4 *>(out);
float4 vinp1;
float4 vinp2;
float4 val;
for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
vinp1 = inp1_4[offset + i];
vinp2 = inp2_4[offset + i];
val.x = vinp1.x + vinp2.x;
val.y = vinp1.y + vinp2.y;
val.z = vinp1.z + vinp2.z;
val.w = vinp1.w + vinp2.w;
out_4[offset + i] = val;
}
}
template <>
__global__ void fused_add2_kernel<__half>(__half *out, const __half *inp1,
const __half *inp2, int hidden_dim) {
int row_id = blockIdx.x;
int offset = flat_2dim(row_id, 0, hidden_dim);
const float4 *inp1_4 = reinterpret_cast<const float4 *>(inp1);
const float4 *inp2_4 = reinterpret_cast<const float4 *>(inp2);
float4 *out_4 = reinterpret_cast<float4 *>(out);
float4 vinp1;
float4 vinp2;
float4 val;
__half2 *h2_inp1 = reinterpret_cast<__half2 *>(&vinp1);
__half2 *h2_inp2 = reinterpret_cast<__half2 *>(&vinp2);
__half2 *h2_val = reinterpret_cast<__half2 *>(&val);
for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
vinp1 = inp1_4[offset + i];
vinp2 = inp2_4[offset + i];
h2_val[0] = __hadd2(h2_inp1[0], h2_inp2[0]);
h2_val[1] = __hadd2(h2_inp1[1], h2_inp2[1]);
h2_val[2] = __hadd2(h2_inp1[2], h2_inp2[2]);
h2_val[3] = __hadd2(h2_inp1[3], h2_inp2[3]);
out_4[offset + i] = val;
}
}
//[b, s, h] -> [b, s, h]
template <>
void launch_fused_add2<float>(float *out, const float *inp1, const float *inp2,
int batch_size, int seq_len, int hidden_dim,
cudaStream_t &stream) {
hidden_dim >>= 2;
dim3 grid_dim(batch_size * seq_len);
dim3 block_dim(min(hidden_dim, MAX_THREADS));
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
hidden_dim);
}
template <>
void launch_fused_add2<__half>(__half *out, const __half *inp1,
const __half *inp2, int batch_size, int seq_len,
int hidden_dim, cudaStream_t &stream) {
hidden_dim >>= 3;
dim3 grid_dim(batch_size * seq_len);
dim3 block_dim(min(hidden_dim, MAX_THREADS));
fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(out, inp1, inp2,
hidden_dim);
}
template <typename T>
__global__ void kernel_concat3_dim1(const T *inp1, const T *inp2, T *output,
int sz0, int sz2, int sz1_1, int sz1_2) {
int nele = sz0 * sz2 * (sz1_1 + sz1_2);
int idx = flat_2dim(blockIdx.x, threadIdx.x, blockDim.x);
if (idx >= nele) {
return;
}
float4 *dst_ptr = (float4 *)output + idx;
int idx2 = idx % sz2;
idx = idx / sz2;
int idx1 = idx % (sz1_1 + sz1_2);
int idx0 = idx / (sz1_1 + sz1_2);
float4 *src_ptr = nullptr;
int sz1 = 0;
if (idx1 < sz1_1) {
sz1 = sz1_1;
src_ptr = (float4 *)inp1;
} else {
idx1 -= sz1_1;
sz1 = sz1_2;
src_ptr = (float4 *)inp2;
}
src_ptr += flat_3dim(idx0, idx1, idx2, sz1, sz2);
dst_ptr[0] = src_ptr[0];
}
template <>
void launch_concat3_dim1<float>(const float *inp1, const float *inp2,
float *output, int sz0, int sz2, int sz1_1,
int sz1_2, cudaStream_t stream) {
sz2 >>= 2;
int nele = sz0 * sz2 * (sz1_1 + sz1_2);
int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
template <>
void launch_concat3_dim1<__half>(const __half *inp1, const __half *inp2,
__half *output, int sz0, int sz2, int sz1_1,
int sz1_2, cudaStream_t stream) {
sz2 >>= 3;
int nele = sz0 * sz2 * (sz1_1 + sz1_2);
int nblock = (nele + MAX_THREADS - 1) / MAX_THREADS;
kernel_concat3_dim1<<<nblock, MAX_THREADS, 0, stream>>>(
inp1, inp2, output, sz0, sz2, sz1_1, sz1_2);
}
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <iostream>
#include <string>
#include "cuda_util.h"
class Context {
public:
Context() : _stream(nullptr) {
CHECK_GPU_ERROR(cublasCreate(&_cublasHandle));
}
virtual ~Context() {}
static Context &Instance() {
static Context _ctx;
return _ctx;
}
void set_stream(cudaStream_t stream) {
_stream = stream;
CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream));
}
cudaStream_t get_stream() { return _stream; }
cublasHandle_t get_cublashandle() { return _cublasHandle; }
private:
cudaStream_t _stream;
cublasHandle_t _cublasHandle;
};
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <type_traits>
#include "cuda_util.h"
template <typename T>
class CrossEntropyLayer {
public:
CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens);
virtual ~CrossEntropyLayer();
void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr,
float *nll_loss_ptr);
void Backward(const float *grad_outputs_ptr, const T *inputs_ptr,
const int *targets_ptr, T *grad_inputs_ptr);
void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size);
private:
void allocate_mem_buffer() {
// allocate local gpu memory
_loss_buffer = cuda_malloc<float>(_max_batch_tokens * 2);
}
void free_mem_buffer() {
// free local gpu memory
cuda_free(_loss_buffer);
}
const int _padding_idx;
const float _epsilon;
const int _max_batch_tokens;
size_t _batch_size;
size_t _seq_len;
size_t _vocab_size;
float *_loss_buffer;
};
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const float *A,
const float *B, float *C,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const __half *A,
const __half *B, __half *C,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, int k,
const float *alpha, const float *beta,
const float *A, const float *B, float *C,
cublasOperation_t op_A, cublasOperation_t op_B,
int stride_A, int stride_B, int stride_C,
int batch,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
int cublas_strided_batched_gemm(
cublasHandle_t handle, int m, int n, int k, const float *alpha,
const float *beta, const __half *A, const __half *B, __half *C,
cublasOperation_t op_A, cublasOperation_t op_B, int stride_A, int stride_B,
int stride_C, int batch,
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <math_constants.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>
template <typename T>
void check_gpu_error(T result, char const *const func, const char *const file,
int const line);
#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)
template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele);
template <typename T>
T *cuda_malloc(size_t ele_num);
void cuda_free(void *pdata);
template <typename T>
void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf,
std::string file, int line, cudaStream_t stream);
#define CHECK_NAN_INF(ptr, size, stream) \
check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream)); \
check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream))
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <string>
#include "kernels.h"
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
bool training;
Config(float r) : ratio(r), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
};
Dropout(const Config &config, size_t max_ele_num)
: _config(config), _mask(nullptr) {
_mask = cuda_malloc<uint8_t>(max_ele_num);
}
virtual ~Dropout() { cuda_free(_mask); }
// after attention softmax
void dropout(T *output, const T *input, int count, cudaStream_t stream,
bool bwd = false) {
launch_ls_dropout<T>(output, input, _mask, count, _config.RATIO(), stream,
bwd);
}
void d_dropout(T *d_inp_out, int count, cudaStream_t stream) {
launch_ls_dropout<T>(d_inp_out, d_inp_out, _mask, count, _config.RATIO(),
stream, true);
}
// transformer layer's postprocessing dropout, after attn or ffn module,
// before residual add.
void bias_dropout_residual(T *output, const T *input, const T *residual,
const T *bias, int rows, int cols,
cudaStream_t stream) {
launch_ls_dropout_res_bias<T>(output, input, _mask, bias, residual,
rows * cols, cols, _config.RATIO(), stream);
}
void d_bias_dropout_residual(T *d_input, T *d_bias, const T *d_output,
int rows, int cols, cudaStream_t stream) {
launch_ls_dropout_bias_bwd<T>(d_input, d_bias, d_output, _mask, rows, cols,
_config.RATIO(), stream);
}
// dropout inside ffn.
void bias_act_dropout(T *output, const T *input, const T *bias, int rows,
int cols, std::string activation_fn,
cudaStream_t stream) {
if (activation_fn == "relu") {
launch_ls_dropout_act_bias<ActivationType::kRelu, T>(
output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
stream);
} else if (activation_fn == "gelu") {
launch_ls_dropout_act_bias<ActivationType::kGelu, T>(
output, input, _mask, bias, rows * cols, cols, _config.RATIO(),
stream);
} else {
throw std::runtime_error("not supported activation: " + activation_fn);
}
}
void d_bias_act_dropout(T *d_inp_out, T *d_bias_out, const T *input,
const T *bias, int rows, int cols,
std::string activation_fn, cudaStream_t stream) {
if (activation_fn == "relu") {
launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, T>(
d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
_config.RATIO(), stream);
} else if (activation_fn == "gelu") {
launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, T>(
d_inp_out, d_bias_out, input, bias, d_inp_out, _mask, rows, cols,
_config.RATIO(), stream);
} else {
throw std::runtime_error("not supported activation: " + activation_fn);
}
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
private:
uint8_t *_mask;
Config _config;
};
#pragma once
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <array>
#include "cublas_wrappers.h"
#include "kernels.h"
template <typename T>
class FeedForward {
public:
struct Config {
int outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int outputs, int inputs)
: outputSize(outputs),
inputSize(inputs),
gemm_algos(std::array<int, 3>({99, 99, 99})) {}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
cublasHandle_t &_cublasHandle) {
float alpha = T(1.);
float beta = T(0.);
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
out, cublasGemmAlgo_t(config_.gemm_algos[0]));
}
void Backward(int bsz, const T *out_grad, const T *input_ptr,
const T *weights, T *weights_grad, T *bias_grad,
cublasHandle_t &_cublasHandle, cudaStream_t &stream,
T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
bool compute_bias = true) {
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
if (compute_bias) {
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
config_.outputSize, stream);
}
}
void reset_size(int outputSize, int inputSize) {
config_.outputSize = outputSize;
config_.inputSize = inputSize;
}
private:
Config config_;
};
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#define MAX_THREADS 1024
#define WARP_SIZE 32
enum class ActivationType { kRelu, kGelu };
void launch_curand_init(int total_count, int dim, cudaStream_t stream);
template <typename T>
void launch_layer_norm(T *ln_res, T *vars, T *means, const T *inp,
const T *scale, const T *bias, int batch_size,
int hidden_dim, cudaStream_t stream);
template <typename T>
void launch_ln_bw(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
const T *residual_grad, const T *inp_or_out, const T *gamma,
const T *betta, const T *vars, const T *means, int batch,
int hidden_dim, cudaStream_t stream[2]);
template <typename T>
void launch_attn_softmax(T *vals, const T *attn_mask, int batch_size, int heads,
int from_len, int to_len, bool mask_future,
cudaStream_t stream);
template <typename T>
void launch_attn_softmax_bw(T *out_grad, const T *soft_inp, int rows,
int softmax_len, cudaStream_t stream);
// [b, s, h] -> [b, nh, s, ad]
template <typename T>
void launch_transform_0213(T *output, const T *vals, int batch_size,
int seq_length, int hidden_dim, int nhead,
cudaStream_t stream);
// [b, s, 3, h] -> [3, b, nh, s, ad]
template <typename T>
void launch_bias_add_transform_20314(T *output, const T *input, const T *bias,
int dim_0, int dim_1, int dim_2, int dim_3,
int dim_4, cudaStream_t stream);
// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]
template <typename T>
void launch_transform4d_0213(T *output, const T *vals, int batch_size,
int seq_len, int hidden_dim, int nhead,
int trans_count, cudaStream_t stream);
template <typename T>
void launch_ls_dropout(T *out, const T *vals, uint8_t *mask, int total_count,
float ratio, cudaStream_t stream, bool backward = false);
template <typename T>
void launch_ls_dropout_res_bias(T *out, const T *vals, uint8_t *mask,
const T *bias, const T *residual,
int total_count, int dim, float ratio,
cudaStream_t stream);
template <ActivationType, typename T>
void launch_ls_dropout_act_bias(T *out, const T *vals, uint8_t *mask,
const T *bias, int total_count, int dim,
float ratio, cudaStream_t stream);
template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);
template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
const T *bias, const T *out_grad,
const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);
template <typename T>
void launch_fuse_transpose_bias_kernel(const T *inp, T *out, int rows, int cols,
cudaStream_t stream);
void launch_param_update(const float *input, __half *output, int size,
cudaStream_t stream);
template <typename T>
void launch_concat3_dim1(const T *inp1, const T *inp2, T *output, int sz0,
int sz2, int sz1_1, int sz1_2, cudaStream_t stream);
template <typename T>
void launch_fused_add2(T *out, const T *inp1, const T *inp2, int batch_size,
int seq_len, int hidden_size, cudaStream_t &stream);
template <typename T>
void launch_cross_entropy_fw(const T *inputs_ptr, const int *targets_ptr,
float *outputs_ptr, float *nll_loss_ptr,
float *loss_buffer, const int padding_idx,
const float epsilon, const int batch_size,
const int seq_len, const int vocab_size,
cudaStream_t stream);
template <typename T>
void launch_cross_entropy_bw(const float *grad_outputs_ptr, const T *inputs_ptr,
const int *targets_ptr, T *grad_inputs_ptr,
const int padding_idx, const float epsilon,
const int batch_size, const int seq_len,
const int vocab_size, cudaStream_t stream);
template <typename T>
void launch_lookup_scale_pos_dropout(
T *output, const int *input, const T *embeddings, const T *pos_embeddings,
uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
int padding_idx, float dropout_ratio, int step, cudaStream_t &stream);
template <typename T>
void launch_d_lookup_scale_pos_dropout(
T *grad_embeddings, const T *grad_output, const int *input,
const uint8_t *dropout_mask, int batch_size, int seq_len, int embedding_dim,
int vocab_size, int padding_idx, float dropout_ratio, cudaStream_t &stream);
/* Convert 2-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_2dim(int id1, int id2, int dim2) {
return id1 * dim2 + id2;
}
/* Convert 3-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_3dim(int id1, int id2, int id3,
int dim2, int dim3) {
return id1 * dim2 * dim3 + id2 * dim3 + id3;
}
/* Convert 4-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_4dim(int id1, int id2, int id3,
int id4, int dim2, int dim3,
int dim4) {
// return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
int res = id4;
int ld = dim4;
res += id3 * ld;
ld *= dim3;
res += id2 * ld;
ld *= dim2;
res += id1 * ld;
return res;
}
/* Convert 5-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_5dim(int id1, int id2, int id3,
int id4, int id5, int dim2,
int dim3, int dim4,
int dim5) {
// return id1*(dim2*dim3*dim4*dim5) + id2*(dim3*dim4*dim5) + id3*(dim4*dim5) +
// id4*dim5 + dim5;
int res = id5;
int ld = dim5;
res += id4 * ld;
ld *= dim4;
res += id3 * ld;
ld *= dim3;
res += id2 * ld;
ld *= dim2;
res += id1 * ld;
return res;
}
/* Convert 6-dim tensor index into vector index */
__forceinline__ __host__ __device__ int flat_6dim(int id1, int id2, int id3,
int id4, int id5, int id6,
int dim2, int dim3, int dim4,
int dim5, int dim6) {
// return id1*(dim2*dim3*dim4*dim5*dim6) + id2*(dim3*dim4*dim5*dim6) +
// id3*(dim4*dim5*dim6) + id4*(dim5*dim6) + id5*dim6 + id6;
int res = id6;
int ld = dim6;
res += id5 * ld;
ld *= dim5;
res += id4 * ld;
ld *= dim4;
res += id3 * ld;
ld *= dim3;
res += id2 * ld;
ld *= dim2;
res += id1 * ld;
return res;
}
/* Convert vector index to 6-dim tensor index */
__forceinline__ __host__ __device__ void decompose_6dim(
int src, int dim1, int dim2, int dim3, int dim4, int dim5, int *id0,
int *id1, int *id2, int *id3, int *id4, int *id5) {
*id5 = src % dim5;
src /= dim5;
*id4 = src % dim4;
src /= dim4;
*id3 = src % dim3;
src /= dim3;
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 5-dim tensor index */
__forceinline__ __host__ __device__ void decompose_5dim(int src, int dim1,
int dim2, int dim3,
int dim4, int *id0,
int *id1, int *id2,
int *id3, int *id4) {
*id4 = src % dim4;
src /= dim4;
*id3 = src % dim3;
src /= dim3;
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 4-dim tensor index */
__forceinline__ __host__ __device__ void decompose_4dim(int src, int dim1,
int dim2, int dim3,
int *id0, int *id1,
int *id2, int *id3) {
*id3 = src % dim3;
src /= dim3;
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 3-dim tensor index */
__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
int dim2, int *id0,
int *id1, int *id2) {
*id2 = src % dim2;
src /= dim2;
*id1 = src % dim1;
*id0 = src / dim1;
}
/* Convert vector index to 2-dim tensor index */
__forceinline__ __host__ __device__ void decompose_2dim(int src, int dim1,
int *id0, int *id1) {
*id1 = src % dim1;
*id0 = src / dim1;
}
// copied from https://github.com/dmlc/dgl/pull/2758
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define CUB_NS_PREFIX namespace ls {
#define CUB_NS_POSTFIX }
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Normalize_Layer {
public:
struct Config {
uint32_t hidden_dim;
bool use_mean;
Config(uint32_t hidden_dim, bool use_mean = false)
: hidden_dim(hidden_dim), use_mean(use_mean) {}
};
Normalize_Layer(Config config, size_t max_rows)
: config_(config), vars_(nullptr), means_(nullptr) {
vars_ = cuda_malloc<T>(max_rows);
if (config_.use_mean) {
means_ = cuda_malloc<T>(max_rows);
}
}
~Normalize_Layer() {
cuda_free(vars_);
cuda_free(means_);
}
void Forward(T *ln_res, const T *inp, const T *gamma, const T *betta,
int batch_size, cudaStream_t stream) {
launch_layer_norm(ln_res, vars_, means_, inp, gamma, betta, batch_size,
config_.hidden_dim, stream);
}
/*
residual_grad, inp_or_out, betta should be treated carefully.
inp_or_out = input if use_mean else output
residual_grad, betta can be nullptr.
residual_grad will be added to dinp if it is not nullptr
which is useful in transformer layer when pre-ln
betta are only used to compute xhat,
(use_mean == false) ^ (betta == nullptr) should be true
*/
void Backward(T *gamma_grad, T *betta_grad, T *inp_grad, const T *out_grad,
const T *residual_grad, const T *inp_or_out, const T *gamma,
const T *betta, int batch_size, cudaStream_t stream[2]) {
launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
inp_or_out, gamma, betta, vars_, means_, batch_size,
config_.hidden_dim, stream);
}
inline bool use_mean() const { return config_.use_mean; }
private:
Config config_;
T *vars_;
T *means_;
};
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "kernels.h"
using namespace std;
template <typename T>
class Softmax {
public:
struct Config {
size_t nhead;
Config(size_t nhead) : nhead(nhead) {}
};
Softmax(Config config) : config_(config) {}
~Softmax() {}
void Forward(T *vals, const T *attn_mask, int batch_size, int from_len,
int to_len, cudaStream_t &stream, bool mask_future = true) {
launch_attn_softmax<T>(vals, attn_mask, batch_size, config_.nhead, from_len,
to_len, mask_future, stream);
}
void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len,
int to_len, cudaStream_t stream) {
launch_attn_softmax_bw<T>(out_grad, soft_out,
batch_size * config_.nhead * from_len, to_len,
stream);
}
void reset_size(size_t nhead) { config_.nhead = nhead; }
private:
Config config_;
};
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <array>
#include "cublas_wrappers.h"
template <typename T>
class StridedBatchGemm {
public:
struct Config {
int m;
int n;
int k;
float alpha;
float beta;
cublasOperation_t op_A;
cublasOperation_t op_B;
std::array<int, 3> gemm_algos;
Config(float param_alpha, float param_beta, cublasOperation_t opA,
cublasOperation_t opB)
: alpha(param_alpha),
beta(param_beta),
op_A(opA),
op_B(opB),
gemm_algos(std::array<int, 3>({99, 99, 99})) {}
void SetConfig(int mm, int nn, int kk) {
m = mm;
n = nn;
k = kk;
}
};
StridedBatchGemm(const Config &config) : _config(config) {}
virtual ~StridedBatchGemm() {}
void Forward(int bsz, T *output, const T *_buffer_a, const T *_buffer_b,
cublasHandle_t handle) {
int stride_a = _config.m * _config.k;
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
cublas_strided_batched_gemm(
handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
_buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
stride_b, stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[0]));
}
void Backward(int bsz, const T *d_output, const T *_buffer_a,
const T *_buffer_b, cublasHandle_t handle,
T *inpGradA = nullptr, T *inpGradB = nullptr) {
int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
int stride_a = mb * _config.n;
int stride_b = _config.n * kb;
int stride_c = _config.m * _config.k;
// B need to transpose.
cublasOperation_t op_b =
(_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
// Calculate d_A.
cublas_strided_batched_gemm(
handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
(_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
(_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), inpGradA,
CUBLAS_OP_N, op_b, stride_a, stride_b, stride_c, bsz,
cublasGemmAlgo_t(_config.gemm_algos[1]));
// A need to transpose.
cublasOperation_t op_a =
(_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
stride_a = _config.m * _config.k;
stride_b = _config.m * _config.n;
stride_c = _config.n * _config.k;
// Calculate d_B.
cublas_strided_batched_gemm(
handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
_buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[2]));
}
inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
private:
Config _config;
};
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment