Commit c25a91b6 authored by aiss's avatar aiss
Browse files

Merge branch 'ds-v0.9.2-rocm' into 'main'

DS v0.9.2 ROCm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include <limits>
#include "conversion_utils.h"
#include "inference_cuda_layers.h"
#ifndef __HIP_PLATFORM_HCC__
......@@ -12,7 +14,6 @@ Copyright 2022 The Microsoft DeepSpeed Team
#include <cstdlib>
#include <ctime>
#define ATTN_THREADS 256
#define MAX_REG_SIZE 8
#define minus_infinity -10000.0
......@@ -30,9 +31,10 @@ void CheckCudaErrorAux(const char* file, unsigned line)
namespace cg = cooperative_groups;
__global__ void attn_softmax_v2(__half* vals,
__half* mask,
__half* alibi,
template <typename T, int iterations>
__global__ void attn_softmax_v2(T* vals,
T* mask,
T* alibi,
float layer_scale,
bool triangular,
bool recompute,
......@@ -45,7 +47,6 @@ __global__ void attn_softmax_v2(__half* vals,
int head_offset,
int mask_stride,
int mp_size,
int iterations,
int reduceWidth)
{
cg::thread_block b = cg::this_thread_block();
......@@ -53,7 +54,7 @@ __global__ void attn_softmax_v2(__half* vals,
float2 low_data[MAX_REG_SIZE];
float2 high_data[MAX_REG_SIZE];
const __half zero_h = __float2half(0.f);
const T zero_h = conversion::to<T>(0.f);
int wid = threadIdx.x >> 5;
int lane = threadIdx.x & 0x1f;
......@@ -75,7 +76,6 @@ __global__ void attn_softmax_v2(__half* vals,
alibi_offset = (alibi_offset + ((iter_offset / num_seq) % heads)) * sequence_length;
mask_offset = mask_offset * sequence_length;
int seq_id = iter_offset % num_seq;
int seq_id4 = seq_id >> 2;
int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length);
int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2))
......@@ -87,83 +87,109 @@ __global__ void attn_softmax_v2(__half* vals,
float max_val = minus_infinity;
// if (lane == 0) printf("%d, %d: %d \n", wid, blockIdx.x, mask_offset);
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 &&
data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
low_data[i].x = data_id > window_stride
? __half2float(vals[data_id]) * layer_scale
: minus_infinity;
low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride)
? __half2float(vals[data_id + 1]) * layer_scale
: minus_infinity;
high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride)
? __half2float(vals[data_id + 2]) * layer_scale
: minus_infinity;
high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) &&
(data_id + 3) > window_stride)
? __half2float(vals[data_id + 3]) * layer_scale
: minus_infinity;
if (alibi) {
low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]);
low_data[i].y =
low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]);
high_data[i].x =
high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]);
high_data[i].y =
high_data[i].y + __half2float(alibi[data_id + alibi_offset + 3]);
}
if (mask) {
low_data[i].x += __half2float(mask[data_id + mask_offset]);
low_data[i].y += __half2float(mask[data_id + mask_offset + 1]);
high_data[i].x += __half2float(mask[data_id + mask_offset + 2]);
high_data[i].y += __half2float(mask[data_id + mask_offset + 3]);
}
} else {
low_data[i].x = data_id > window_stride
? __half2float(vals[data_id]) * layer_scale
: minus_infinity;
low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) &&
(data_id + 1) > window_stride) &&
(data_id + 1) < sequence_length)
? __half2float(vals[data_id + 1]) * layer_scale
: minus_infinity;
high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) &&
(data_id + 2) > window_stride) &&
(data_id + 2) < sequence_length)
? __half2float(vals[data_id + 2]) * layer_scale
: minus_infinity;
if (alibi) {
low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]);
if ((data_id + 1) < sequence_length)
low_data[i].y =
low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]);
if ((data_id + 2) < sequence_length)
high_data[i].x =
high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]);
}
high_data[i].y = minus_infinity;
if (mask) {
low_data[i].x += __half2float(mask[data_id + mask_offset]);
if ((data_id + 1) < sequence_length)
low_data[i].y += __half2float(mask[data_id + mask_offset + 1]);
if ((data_id + 2) < sequence_length)
high_data[i].x += __half2float(mask[data_id + mask_offset + 2]);
}
}
// if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id);
max_val = (low_data[i].x > max_val ? low_data[i].x : max_val);
max_val = (low_data[i].y > max_val ? low_data[i].y : max_val);
max_val = (high_data[i].x > max_val ? high_data[i].x : max_val);
max_val = (high_data[i].y > max_val ? high_data[i].y : max_val);
int data_id = i * (reduceWidth << 2) + (seq_lane);
bool check = (data_id >> 2) >= window_stride4;
bool low_x_check = check && (data_id < sequence_length) &&
(!triangular || (data_id <= seq_id)) && (data_id > window_stride);
bool low_y_check = check && ((data_id + reduceWidth) < sequence_length) &&
(!triangular || ((data_id + reduceWidth) <= seq_id)) &&
((data_id + reduceWidth) > window_stride);
bool high_x_check = check && ((data_id + reduceWidth * 2) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 2) <= seq_id)) &&
((data_id + reduceWidth * 2) > window_stride);
bool high_y_check = check && ((data_id + reduceWidth * 3) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 3) <= seq_id)) &&
((data_id + reduceWidth * 3) > window_stride);
if (mask && alibi) {
low_data[i].x = low_x_check
? conversion::to<float>(vals[data_id]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset])) +
(conversion::to<float>(mask[data_id + mask_offset]))
: minus_infinity;
low_data[i].y =
low_y_check
? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset + reduceWidth])) +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth]))
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 2])) +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 2]))
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 3])) +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 3]))
: minus_infinity;
} else if (mask) {
low_data[i].x = low_x_check
? conversion::to<float>(vals[data_id]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset]))
: minus_infinity;
low_data[i].y =
low_y_check
? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth]))
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 2]))
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 3]))
: minus_infinity;
} else if (alibi) {
low_data[i].x = low_x_check
? conversion::to<float>(vals[data_id]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset]))
: minus_infinity;
low_data[i].y =
low_y_check
? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset + reduceWidth]))
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 2]))
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 3]))
: minus_infinity;
} else {
low_data[i].x = minus_infinity;
low_data[i].y = minus_infinity;
high_data[i].x = minus_infinity;
high_data[i].y = minus_infinity;
low_data[i].x = low_x_check ? conversion::to<float>(vals[data_id]) * layer_scale
: minus_infinity;
low_data[i].y =
low_y_check ? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale
: minus_infinity;
}
// if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id);
max_val = (low_data[i].x > max_val ? low_data[i].x : max_val);
max_val = (low_data[i].y > max_val ? low_data[i].y : max_val);
max_val = (high_data[i].x > max_val ? high_data[i].x : max_val);
max_val = (high_data[i].y > max_val ? high_data[i].y : max_val);
}
for (int i = 1; i < WARP_SIZE; i *= 2) {
......@@ -212,26 +238,21 @@ __global__ void attn_softmax_v2(__half* vals,
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
int data_id = i * (reduceWidth << 2) + (seq_lane);
if (data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
vals[data_id] = __float2half(low_data[i].x / sum);
vals[data_id + 1] = __float2half(low_data[i].y / sum);
vals[data_id + 2] = __float2half(high_data[i].x / sum);
vals[data_id + 3] = __float2half(high_data[i].y / sum);
} else {
vals[data_id] = __float2half(low_data[i].x / sum);
if ((data_id + 1) < sequence_length)
vals[data_id + 1] = __float2half(low_data[i].y / sum);
if ((data_id + 2) < sequence_length)
vals[data_id + 2] = __float2half(high_data[i].x / sum);
}
vals[data_id] = conversion::to<T>(low_data[i].x / sum);
if ((data_id + reduceWidth) < sequence_length)
vals[data_id + reduceWidth] = conversion::to<T>(low_data[i].y / sum);
if ((data_id + reduceWidth * 2) < sequence_length)
vals[data_id + reduceWidth * 2] = conversion::to<T>(high_data[i].x / sum);
if ((data_id + reduceWidth * 3) < sequence_length)
vals[data_id + reduceWidth * 3] = conversion::to<T>(high_data[i].y / sum);
}
}
}
}
template <int iterations>
__global__ void attn_softmax_v2(float* vals,
float* attn_mask,
float* alibi,
......@@ -247,7 +268,6 @@ __global__ void attn_softmax_v2(float* vals,
int head_offset,
int mask_stride,
int mp_size,
int iterations,
int reduceWidth)
{
cg::thread_block b = cg::this_thread_block();
......@@ -269,11 +289,9 @@ __global__ void attn_softmax_v2(float* vals,
vals += (iter_offset * sequence_length);
int batch_idx = iter_offset / (num_seq * heads);
int alibi_offset = batch_idx * heads * mp_size + head_offset;
int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride);
mask_offset = mask_offset * sequence_length;
int seq_id = iter_offset % num_seq;
int seq_id4 = seq_id >> 2;
int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length);
int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2))
......@@ -285,58 +303,43 @@ __global__ void attn_softmax_v2(float* vals,
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 &&
data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity);
data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride)
? vals[data_id + 1]
int data_id = i * (reduceWidth << 2) + (seq_lane);
bool check = (data_id >> 2) >= window_stride4;
bool x_check = check && (data_id < sequence_length) &&
(!triangular || (data_id <= seq_id)) && (data_id > window_stride);
bool y_check = check && ((data_id + reduceWidth) < sequence_length) &&
(!triangular || ((data_id + reduceWidth) <= seq_id)) &&
((data_id + reduceWidth) > window_stride);
bool z_check = check && ((data_id + reduceWidth * 2) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 2) <= seq_id)) &&
((data_id + reduceWidth * 2) > window_stride);
bool w_check = check && ((data_id + reduceWidth * 3) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 3) <= seq_id)) &&
((data_id + reduceWidth * 3) > window_stride);
if (attn_mask) {
data[i].x = x_check ? vals[data_id] + attn_mask[data_id + mask_offset]
: minus_infinity;
data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride)
? vals[data_id + 2]
data[i].y = y_check ? vals[data_id + reduceWidth] +
attn_mask[data_id + mask_offset + reduceWidth]
: minus_infinity;
data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) &&
(data_id + 3) > window_stride)
? vals[data_id + 3]
data[i].z = z_check ? vals[data_id + reduceWidth * 2] +
attn_mask[data_id + mask_offset + reduceWidth * 2]
: minus_infinity;
if (attn_mask) {
data[i].x += attn_mask[data_id + mask_offset];
data[i].y += attn_mask[data_id + mask_offset + 1];
data[i].z += attn_mask[data_id + mask_offset + 2];
data[i].w += attn_mask[data_id + mask_offset + 3];
}
} else {
data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity;
data[i].y = (((!triangular || (data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride && (data_id + 1) < sequence_length)
? (vals[data_id + 1])
data[i].w = w_check ? vals[data_id + reduceWidth * 3] +
attn_mask[data_id + mask_offset + reduceWidth * 3]
: minus_infinity;
data[i].z = (((!triangular || (data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride && (data_id + 2) < sequence_length)
? (vals[data_id + 2])
: minus_infinity;
data[i].w = minus_infinity;
if (attn_mask) {
data[i].x += attn_mask[data_id + mask_offset];
if ((data_id + 1) < sequence_length)
data[i].y += attn_mask[data_id + mask_offset + 1];
if ((data_id + 2) < sequence_length)
data[i].z += attn_mask[data_id + mask_offset + 2];
}
}
max_val = (data[i].x > max_val ? data[i].x : max_val);
max_val = (data[i].y > max_val ? data[i].y : max_val);
max_val = (data[i].z > max_val ? data[i].z : max_val);
max_val = (data[i].w > max_val ? data[i].w : max_val);
} else {
data[i].x = minus_infinity;
data[i].y = minus_infinity;
data[i].z = minus_infinity;
data[i].w = minus_infinity;
data[i].x = x_check ? vals[data_id] : minus_infinity;
data[i].y = y_check ? vals[data_id + reduceWidth] : minus_infinity;
data[i].z = z_check ? vals[data_id + reduceWidth * 2] : minus_infinity;
data[i].w = w_check ? vals[data_id + reduceWidth * 3] : minus_infinity;
}
max_val = (data[i].x > max_val ? data[i].x : max_val);
max_val = (data[i].y > max_val ? data[i].y : max_val);
max_val = (data[i].z > max_val ? data[i].z : max_val);
max_val = (data[i].w > max_val ? data[i].w : max_val);
}
for (int i = 1; i < WARP_SIZE; i *= 2) {
......@@ -387,24 +390,38 @@ __global__ void attn_softmax_v2(float* vals,
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
int data_id = i * (reduceWidth << 2) + (seq_lane);
if (data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
vals[data_id] = data[i].x / sum;
vals[data_id + 1] = data[i].y / sum;
vals[data_id + 2] = data[i].z / sum;
vals[data_id + 3] = data[i].w / sum;
} else {
vals[data_id] = data[i].x / sum;
if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum;
if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum;
}
vals[data_id] = data[i].x / sum;
if ((data_id + reduceWidth) < sequence_length)
vals[data_id + reduceWidth] = data[i].y / sum;
if ((data_id + reduceWidth * 2) < sequence_length)
vals[data_id + reduceWidth * 2] = data[i].z / sum;
if ((data_id + reduceWidth * 3) < sequence_length)
vals[data_id + reduceWidth * 3] = data[i].w / sum;
}
}
}
}
// Launches attn_softmax_v2 for a compile-time iteration count. Expands at a
// site where `T`, `grid`, `block`, `stream`, and all kernel arguments are in
// scope (see the launch_attn_softmax_v2 host function, which dispatches on the
// runtime `iterations` value to one of these fixed-count instantiations).
#define LAUNCH_ATTN_SOFTMAX_V2(iterations) \
attn_softmax_v2<T, iterations><<<grid, block, 0, stream>>>(vals, \
mask, \
alibi, \
layer_scale, \
triangular, \
recompute, \
local_attention, \
window_size, \
total_count, \
heads, \
sequence_length, \
num_seq, \
head_offset, \
mask_stride, \
mp_size, \
reduce_width);
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
......@@ -423,34 +440,50 @@ void launch_attn_softmax_v2(T* vals,
int mp_size,
cudaStream_t stream)
{
int total_count = batch_size * heads * num_seq;
int warp_num = ATTN_THREADS / WARP_SIZE;
int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1);
reduce_width = (int)pow(2.0, floor(log2((float)(reduce_width)))) * WARP_SIZE;
dim3 grid_dim((total_count - 1) / (ATTN_THREADS / reduce_width) + 1);
dim3 block_dim(ATTN_THREADS);
const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1;
if (sequence_length <= 32768)
attn_softmax_v2<<<grid_dim, block_dim, 0, stream>>>(vals,
mask,
alibi,
layer_scale,
triangular,
recompute,
local_attention,
window_size,
total_count,
heads,
sequence_length,
num_seq,
head_offset,
mask_stride,
mp_size,
iterations,
reduce_width);
else
const int total_count = batch_size * heads * num_seq;
// Scheduling Overview
// 4 element unroll with power of 2 `reduce_width` threads to a ceiling of `attn_threads`
// Each block should be partitioned into as many `reduce_width` blocks
// as can be fit.
constexpr int attn_threads = 256;
constexpr int min_reduce_width = hw_warp_size;
constexpr int internal_unroll = 4;
// Handle internal unroll then round to next power of 2. Bump up to minimum granularity.
const int thread_steps_rounded =
next_pow2((sequence_length + internal_unroll - 1) / internal_unroll);
const int thread_steps_schedule =
(thread_steps_rounded < min_reduce_width) ? min_reduce_width : thread_steps_rounded;
// Bound reduce width to the number of threads
const int reduce_width = (thread_steps_schedule < attn_threads) ? thread_steps_schedule
: attn_threads;
// Scale for the excess
const int iterations = thread_steps_schedule / reduce_width;
// Should be safe since reduce_width is capped to attn_threads
const int partitions = attn_threads / reduce_width;
// Launch params
dim3 grid((total_count + partitions - 1) / partitions);
dim3 block(attn_threads);
if (sequence_length <= 32768) {
if (iterations == 1) {
LAUNCH_ATTN_SOFTMAX_V2(1);
} else if (iterations == 2) {
LAUNCH_ATTN_SOFTMAX_V2(2);
} else if (iterations == 4) {
LAUNCH_ATTN_SOFTMAX_V2(4);
} else if (iterations == 8) {
LAUNCH_ATTN_SOFTMAX_V2(8);
} else if (iterations == 16) {
LAUNCH_ATTN_SOFTMAX_V2(16);
} else if (iterations == 32) {
LAUNCH_ATTN_SOFTMAX_V2(32);
} else if (iterations == 64) {
LAUNCH_ATTN_SOFTMAX_V2(64);
}
} else
throw std::runtime_error("Unsupport Seq_Length!");
}
......@@ -470,6 +503,26 @@ template void launch_attn_softmax_v2(float* vals,
int mask_stride,
int mp_size,
cudaStream_t stream);
// Explicit instantiation of the bfloat16 softmax launcher; compiled only when
// the toolchain/architecture provides __nv_bfloat16 support.
#ifdef BF16_AVAILABLE
template void launch_attn_softmax_v2(__nv_bfloat16* vals,
__nv_bfloat16* mask,
__nv_bfloat16* alibi,
float layer_scale,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
int head_offset,
int mask_stride,
int mp_size,
cudaStream_t stream);
#endif
template void launch_attn_softmax_v2(__half* vals,
__half* mask,
__half* alibi,
......@@ -486,3 +539,53 @@ template void launch_attn_softmax_v2(__half* vals,
int mask_stride,
int mp_size,
cudaStream_t stream);
// Explicitly instantiates the __half attn_softmax_v2 kernel for one
// compile-time iteration count; invoked below via FOREACH_ITERATIONS so every
// count the launcher can dispatch to has a definition in this TU.
#define DEF_ATTN_SOFTMAX_V2_HALF(_iter) \
template __global__ void attn_softmax_v2<__half, _iter>(__half * vals, \
__half * mask, \
__half * alibi, \
float layer_scale, \
bool triangular, \
bool recompute, \
bool local_attention, \
int window_size, \
int total_count, \
int heads, \
int sequence_length, \
int num_seq, \
int head_offset, \
int mask_stride, \
int mp_size, \
int reduceWidth)
// bfloat16 counterpart of DEF_ATTN_SOFTMAX_V2_HALF: explicitly instantiates
// the attn_softmax_v2 kernel for one compile-time iteration count.
#define DEF_ATTN_SOFTMAX_V2_BF16(_iter) \
template __global__ void attn_softmax_v2<__nv_bfloat16, _iter>(__nv_bfloat16 * vals, \
__nv_bfloat16 * mask, \
__nv_bfloat16 * alibi, \
float layer_scale, \
bool triangular, \
bool recompute, \
bool local_attention, \
int window_size, \
int total_count, \
int heads, \
int sequence_length, \
int num_seq, \
int head_offset, \
int mask_stride, \
int mp_size, \
int reduceWidth)
// Applies `cb` to every iteration count the runtime dispatcher in
// launch_attn_softmax_v2 can select (powers of two, 1..64).
#define FOREACH_ITERATIONS(cb) \
cb(1); \
cb(2); \
cb(4); \
cb(8); \
cb(16); \
cb(32); \
cb(64)
// Emit explicit kernel instantiations for all supported iteration counts.
FOREACH_ITERATIONS(DEF_ATTN_SOFTMAX_V2_HALF);
#ifdef BF16_AVAILABLE
FOREACH_ITERATIONS(DEF_ATTN_SOFTMAX_V2_BF16);
#endif
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include "conversion_utils.h"
#include "inference_cuda_layers.h"
namespace cg = cooperative_groups;
// only used to avoid compilation error due to lack of definition.
#ifndef BF16_AVAILABLE
using __nv_bfloat162 = __half2;
#endif
// Bias add
__global__ void bias_add_transform_0213(float* output,
......@@ -74,11 +81,12 @@ __global__ void bias_add_transform_0213(float* output,
#define ATTN_H 3
#define MAX_SEQ_LINE 10
__global__ void bias_add_transform_0213(__half* output, // q
__half* k_cache,
__half* v_cache,
const __half* vals, // qkv
const __half* bias,
template <typename T>
__global__ void bias_add_transform_0213(T* output, // q
T* k_cache,
T* v_cache,
const T* vals, // qkv
const T* bias,
int hidden_dim,
int seq_length,
unsigned seq_offset,
......@@ -90,6 +98,8 @@ __global__ void bias_add_transform_0213(__half* output, // q
int head_ext,
int max_out_tokens)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
unsigned half_dim = (rotary_dim << 3) >> 1;
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
......@@ -107,8 +117,8 @@ __global__ void bias_add_transform_0213(__half* output, // q
float4 vals_arr;
float4 output_arr;
__half2* vals_half = reinterpret_cast<__half2*>(&vals_arr);
__half2* output_half = reinterpret_cast<__half2*>(&output_arr);
T2* vals_half = reinterpret_cast<T2*>(&vals_arr);
T2* output_half = reinterpret_cast<T2*>(&output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
float4* output_vec =
......@@ -128,17 +138,19 @@ __global__ void bias_add_transform_0213(__half* output, // q
int lane = d3 & 0x1f;
if (cnt < 2 && rotary_dim > 0 && d3 < rotary_dim) {
float4 q = vals_vec[d3];
__half2* q_h = reinterpret_cast<__half2*>(&q);
T2* q_h = reinterpret_cast<T2*>(&q);
if (rotate_every_two) {
#pragma unroll
for (int o = 0; o < 4; o++) {
float inv_freq = (float)(((d3 << 2) + o) * 2) / (float)(rotary_dim << 3);
inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id;
float q_data[2];
q_data[0] = (float)q_h[o].x;
q_data[1] = (float)q_h[o].y;
q_h[o].x = (__half)(-1.0 * q_data[1] * sinf(inv_freq) + q_data[0] * cosf(inv_freq));
q_h[o].y = (__half)(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq));
q_data[0] = conversion::to<float>(q_h[o].x);
q_data[1] = conversion::to<float>(q_h[o].y);
q_h[o].x = conversion::to<T>(-1.0 * q_data[1] * sinf(inv_freq) +
q_data[0] * cosf(inv_freq));
q_h[o].y =
conversion::to<T>(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq));
}
}
output_vec[d3] = q;
......@@ -187,16 +199,17 @@ void launch_bias_add_transform_0213<float>(float* output,
head_ext,
max_out_tokens);
}
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
T* vals,
T* vals1,
const T* vals2,
void launch_bias_add_transform_0213(T* output,
T* k_cache,
T* v_cache,
const T* vals,
const T* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int seq_length1,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
......@@ -204,25 +217,7 @@ void launch_bias_add_transform_0213(T* outputs,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
template <>
void launch_bias_add_transform_0213<__half>(__half* output,
__half* k_cache,
__half* v_cache,
const __half* vals,
const __half* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens)
int max_out_tokens)
{
hidden_dim >>= 3;
int head_ext = 1; // (hidden_dim - 1) / MAX_THREADS + 1;
......@@ -245,6 +240,44 @@ void launch_bias_add_transform_0213<__half>(__half* output,
max_out_tokens);
}
// Explicit instantiations of the templated bias_add_transform_0213 launcher
// for the half-precision types (bf16 guarded by toolchain availability).
#ifdef BF16_AVAILABLE
template void launch_bias_add_transform_0213(__nv_bfloat16* output,
__nv_bfloat16* k_cache,
__nv_bfloat16* v_cache,
const __nv_bfloat16* vals,
const __nv_bfloat16* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
#endif
template void launch_bias_add_transform_0213(__half* output,
__half* k_cache,
__half* v_cache,
const __half* vals,
const __half* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
// Bias add
__global__ void pad_add_transform_0213(float* output,
......@@ -257,17 +290,20 @@ __global__ void pad_add_transform_0213(float* output,
{
}
__global__ void pad_add_transform_0213(__half* output,
const __half* vals,
template <typename T>
__global__ void pad_add_transform_0213(T* output,
const T* vals,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
float4 ZERO;
const __half2 zero_h = __float2half2_rn(0.f);
__half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO);
const T2 zero_h = conversion::to<T2>(0.f);
T2* ZERO_h = reinterpret_cast<T2*>(&ZERO);
#pragma unroll
for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h;
......@@ -300,17 +336,6 @@ __global__ void pad_add_transform_0213(__half* output,
output_vec[d3] = ZERO;
}
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
// [B S C*H] - > C * [B A S N]
template <>
void launch_pad_add_transform_0213<float>(float* output,
......@@ -324,16 +349,17 @@ void launch_pad_add_transform_0213<float>(float* output,
cudaStream_t stream)
{
}
template <>
void launch_pad_add_transform_0213<__half>(__half* output,
const __half* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream)
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream)
{
hidden_dim >>= 3;
dim3 block_dim((padded_head_size >> 3), heads, 2);
......@@ -342,6 +368,28 @@ void launch_pad_add_transform_0213<__half>(__half* output,
output, vals, hidden_dim, seq_length, padded_seq_len, heads, padded_head_size >> 3);
}
// Explicit instantiations of the templated pad/transform launcher for the
// half-precision types (bf16 guarded by toolchain availability).
#ifdef BF16_AVAILABLE
template void launch_pad_add_transform_0213(__nv_bfloat16* output,
const __nv_bfloat16* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
#endif
template void launch_pad_add_transform_0213(__half* output,
const __half* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
// Bias add
template <typename T>
__global__ void bias_add_transform_0213(T* output,
......@@ -393,15 +441,17 @@ __global__ void bias_add_transform_0213<float>(float* output,
d2 * d2_out_stride + d3] = outputs;
}
template <>
__global__ void bias_add_transform_0213<__half>(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
template <typename T>
__global__ void bias_add_transform_0213(T* output,
const T* vals,
const T* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
......@@ -417,9 +467,9 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
float4 vals_arr;
float4 bias_arr;
float4 output_arr;
__half2* vals_half = reinterpret_cast<__half2*>(&vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(&output_arr);
T2* vals_half = reinterpret_cast<T2*>(&vals_arr);
T2* bias_half = reinterpret_cast<T2*>(&bias_arr);
T2* output_half = reinterpret_cast<T2*>(&output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
......@@ -448,13 +498,16 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
output_vec[d3] = output_arr;
}
__global__ void bias_add_transform_0213_v2(__half* output,
const __half* vals,
const __half* bias,
template <typename T>
__global__ void bias_add_transform_0213_v2(T* output,
const T* vals,
const T* bias,
int hidden_dim,
int seq_length,
int heads)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
......@@ -476,9 +529,9 @@ __global__ void bias_add_transform_0213_v2(__half* output,
float4 vals_arr[1];
float4 bias_arr[1];
float4 output_arr[1];
__half2* vals_half = reinterpret_cast<__half2*>(vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(output_arr);
T2* vals_half = reinterpret_cast<T2*>(vals_arr);
T2* bias_half = reinterpret_cast<T2*>(bias_arr);
T2* output_half = reinterpret_cast<T2*>(output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
......@@ -518,6 +571,22 @@ __global__ void bias_add_transform_0213_v2(__half* output,
}
}
// Explicit kernel instantiations of bias_add_transform_0213_v2 for the
// half-precision types (bf16 guarded by toolchain availability).
template __global__ void bias_add_transform_0213_v2(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads);
#ifdef BF16_AVAILABLE
template __global__ void bias_add_transform_0213_v2(__nv_bfloat16* output,
const __nv_bfloat16* vals,
const __nv_bfloat16* bias,
int hidden_dim,
int seq_length,
int heads);
#endif
template <typename T>
__global__ void transform4d_0213(T* out,
const T* in,
......@@ -559,13 +628,13 @@ __global__ void transform4d_0213<float>(float* out,
}
}
template <>
__global__ void transform4d_0213<__half>(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
template <typename T>
__global__ void transform4d_0213(T* out,
const T* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
{
int d0_stride = hidden_dim * (seq_length / head_ext);
int d1_stride = hidden_dim;
......@@ -593,11 +662,8 @@ __global__ void transform4d_0213<__half>(__half* out,
out_vec[d3] = in_vec[d3];
}
__global__ void transform4d_0213_v2(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim)
template <typename T>
__global__ void transform4d_0213_v2(T* out, const T* in, int heads, int seq_length, int hidden_dim)
{
__shared__ float4 in_data[3072];
......@@ -641,6 +707,20 @@ __global__ void transform4d_0213_v2(__half* out,
}
}
// Explicit kernel instantiations of transform4d_0213_v2 for the
// half-precision types (bf16 guarded by toolchain availability).
#ifdef BF16_AVAILABLE
template __global__ void transform4d_0213_v2(__nv_bfloat16* out,
const __nv_bfloat16* in,
int heads,
int seq_length,
int hidden_dim);
#endif
template __global__ void transform4d_0213_v2(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim);
// 3 * [B A S N] - > [B S C*H]
template <>
void launch_transform4d_0213<float>(float* out,
......@@ -659,20 +739,40 @@ void launch_transform4d_0213<float>(float* out,
<<<grid_dims, block_dims, 0, stream>>>(out, in, heads, seq_length, hidden_dim, 1);
}
template <>
void launch_transform4d_0213<__half>(__half* out,
const __half* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count)
template <typename T>
void launch_transform4d_0213<T>(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count)
{
hidden_dim >>= 3;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext));
dim3 block_dims(hidden_dim / heads, (heads / head_ext));
transform4d_0213<__half>
<<<grid_dims, block_dims, 0, stream>>>(out, in, heads, seq_length, hidden_dim, head_ext);
transform4d_0213<<<grid_dims, block_dims, 0, stream>>>(
out, in, heads, seq_length, hidden_dim, head_ext);
}
#ifdef BF16_AVAILABLE
template void launch_transform4d_0213(__nv_bfloat16* out,
const __nv_bfloat16* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
#endif
template void launch_transform4d_0213(__half* out,
const __half* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime_api.h>

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
return std::max(
std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
// Use at least 1 block, since CUDA does not allow empty block
1);
}
class Context {
public:
Context() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0)
{
curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(_gen, 123);
if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
auto message = std::string("Fail to create cublas handle.");
std::cerr << message << std::endl;
throw std::runtime_error(message);
}
#ifndef __HIP_PLATFORM_HCC__
cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
cudaEventCreate(&_comp1_event, (cudaEventDisableTiming | cudaEventBlockingSync));
cudaEventCreate(&_comp2_event, (cudaEventDisableTiming | cudaEventBlockingSync));
cudaEventCreate(&_comp_event, (cudaEventDisableTiming | cudaEventBlockingSync));
cudaEventCreate(&_comm_event, (cudaEventDisableTiming | cudaEventBlockingSync));
#else
cudaEventCreate(&_comp1_event);
cudaEventCreate(&_comp2_event);
cudaEventCreate(&_comp_event);
cudaEventCreate(&_comm_event);
#endif
}
virtual ~Context()
{
cublasDestroy(_cublasHandle);
cudaFree(_workspace);
cudaEventDestroy(_comp1_event);
cudaEventDestroy(_comp2_event);
cudaEventDestroy(_comp_event);
cudaEventDestroy(_comm_event);
}
static Context& Instance()
{
static Context _ctx;
return _ctx;
}
void GenWorkSpace(size_t size)
{
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, size);
} else if (_workSpaceSize < size) {
cudaFree(_workspace);
cudaMalloc(&_workspace, size);
}
_workSpaceSize = size;
}
cudaEvent_t GetCompEvent(int id) { return id == 1 ? _comp1_event : _comp2_event; }
size_t get_workspace_size() const { return _workSpaceSize; }
void* GetWorkSpace() { return _workspace; }
inline unsigned new_token(unsigned layer_id)
{
if (layer_id == 0) _token_length++;
return _token_length;
}
inline void reset_tokens(unsigned initial_tokens = 0)
{
_num_tokens = initial_tokens;
} //_token_length = 0; }
inline unsigned current_tokens() const { return _num_tokens; }
inline void advance_tokens() { _num_tokens++; }
curandGenerator_t& GetRandGenerator() { return _gen; }
cudaStream_t GetCommStream(bool async_op = false)
{
if (!_comm_stream)
_comm_stream = async_op ? at::cuda::getStreamFromPool(true)
: at::cuda::getCurrentCUDAStream();
return _comm_stream;
}
cudaStream_t GetCurrentStream(bool other_stream = false)
{
// get current pytorch stream.
if (other_stream) {
if (!_stream) _stream = at::cuda::getStreamFromPool(true);
return _stream;
}
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
return stream;
}
cublasHandle_t GetCublasHandle() { return _cublasHandle; }
std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
{
uint64_t offset = _curr_offset;
_curr_offset += offset_inc;
return std::pair<uint64_t, uint64_t>(_seed, offset);
}
void SetSeed(uint64_t new_seed) { _seed = new_seed; }
const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }
inline void SynchComp()
{
cudaEventRecord(_comp_event, _comp_stream);
cudaStreamWaitEvent(_comm_stream, _comp_event, 0);
}
inline void SynchComm()
{
cudaEventRecord(_comm_event, _comm_stream);
cudaStreamWaitEvent(_comp_stream, _comm_event, 0);
}
private:
curandGenerator_t _gen;
cublasHandle_t _cublasHandle;
cudaEvent_t _comp_event;
cudaEvent_t _comm_event;
void* _workspace;
uint64_t _seed;
uint64_t _curr_offset;
size_t _workSpaceSize;
cudaEvent_t _comp1_event;
cudaEvent_t _comp2_event;
cudaStream_t _stream;
unsigned _token_length;
unsigned _num_tokens;
std::vector<std::array<int, 3>> _gemm_algos;
cudaStream_t _comp_stream;
cudaStream_t _comm_stream;
std::unordered_map<int, int> _world_sizes;
};
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f32_r,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f32_r,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
C,
rocblas_datatype_f32_r,
m,
C,
rocblas_datatype_f32_r,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_32F,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_32F,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
C,
CUDA_R_32F,
m,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f16_r,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f16_r,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
(void*)C,
rocblas_datatype_f16_r,
m,
(void*)C,
rocblas_datatype_f16_r,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_16F,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_16F,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
(void*)C,
CUDA_R_16F,
m,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
rocblas_gemm_algo algo)
#else
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
rocblas_datatype_f32_r,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f32_r,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f32_r,
m,
stride_C,
C,
rocblas_datatype_f32_r,
m,
stride_C,
batch,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
CUDA_R_32F,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_32F,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_32F,
m,
stride_C,
batch,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
batch,
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
rocblas_gemm_algo algo)
#else
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
rocblas_datatype_f16_r,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f16_r,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f16_r,
m,
stride_C,
C,
rocblas_datatype_f16_r,
m,
stride_C,
batch,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
CUDA_R_16F,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_16F,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_16F,
m,
stride_C,
batch,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#pragma once
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
#define MAX_WARP_NUM 32
#define WARP_SIZE 32
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
template <typename T>
void launch_bias_residual(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int hidden_dim,
int mp_size,
cudaStream_t stream);
template <typename T>
void launch_layer_norm(T* out,
T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream);
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
bool mlp_after_attn,
cudaStream_t stream);
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count,
cudaStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int head_size,
int mp_size,
cudaStream_t stream);
template <typename T>
void launch_apply_rotary_pos_emb(T* mixed_query,
T* key_layer,
unsigned head_size,
unsigned seq_len,
unsigned rotary_dim,
unsigned offset,
unsigned num_heads,
unsigned batch,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream);
template <typename T>
void launch_moe_res_matmul(T* residual,
T* coef,
T* mlp_out,
int seq_len,
int hidden_dim,
cudaStream_t stream);
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#pragma once
......@@ -45,17 +46,20 @@ inline int DS_GET_BLOCKS(const int N)
1);
}
class Context {
class InferenceContext {
public:
Context()
InferenceContext()
: _workspace(nullptr),
_seed(42),
_curr_offset(0),
_stream(0),
_free_memory_size(0),
_num_tokens(1),
_attention_unfused_workspace_offset(0)
_attention_unfused_workspace_offset(0),
_workSpaceSize(0)
{
_workSpaceSize = 0;
_workspace = 0;
if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
auto message = std::string("Fail to create cublas handle.");
std::cerr << message << std::endl;
......@@ -70,7 +74,7 @@ public:
cudaEventCreate(&_comm_event);
}
virtual ~Context()
virtual ~InferenceContext()
{
cublasDestroy(_cublasHandle);
cudaFree(_workspace);
......@@ -80,9 +84,9 @@ public:
cudaEventDestroy(_comm_event);
}
static Context& Instance()
static InferenceContext& Instance()
{
static Context _ctx;
static InferenceContext _ctx;
return _ctx;
}
......@@ -95,7 +99,8 @@ public:
const bool& external_cache,
const size_t& elem_size,
const unsigned& rank,
unsigned max_out_tokens)
unsigned max_out_tokens,
unsigned min_out_tokens)
{
size_t total_size;
if (!_free_memory_size) { cudaMemGetInfo(&_free_memory_size, &total_size); }
......@@ -106,9 +111,9 @@ public:
const int padded_head_size = head_size <= 32 ? 32 : (head_size <= 64 ? 64 : 128);
const int effective_head_size = (head_size > 128) ? head_size : padded_head_size;
size_t activation_size = 16 * (num_heads * effective_head_size) * batch_size;
size_t activation_size = 10 * (num_heads * effective_head_size) * batch_size;
// Other sequence length dimension is added when the final workSpaceSize is calculated
size_t temp_size = batch_size * num_heads * max_out_tokens * 2;
size_t temp_size = batch_size * (num_heads / mp_size) * max_out_tokens;
size_t cache_size =
num_layers * batch_size * ((num_heads * effective_head_size) / mp_size) * 2;
size_t minimal_requirements =
......@@ -128,25 +133,37 @@ public:
: (activation_size + temp_size + cache_size))) *
_max_seq_len * elem_size;
temp_size *= _max_seq_len * elem_size;
if (rank == 0 && !_workspace)
if (_max_seq_len < min_out_tokens) {
printf(
"Allocatable workspace available (%d tokens) is less than minimum requested "
"workspace (%d tokens)\n",
_max_seq_len,
min_out_tokens);
throw std::runtime_error("Workspace can't be allocated, not enough memory");
}
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, workSpaceSize);
} else if (_workSpaceSize < workSpaceSize) {
cudaFree(_workspace);
cudaMalloc(&_workspace, workSpaceSize);
}
if (rank == 0 && (!_workspace || _workSpaceSize < workSpaceSize))
printf(
"------------------------------------------------------\n"
"Free memory : %f (GigaBytes) \n"
"Total memory: %f (GigaBytes) \n"
"Requested memory: %f (GigaBytes) \n"
"Setting maximum total tokens (input + output) to %lu \n"
"WorkSpace: %p \n"
"------------------------------------------------------\n",
(float)_free_memory_size / GIGABYTE,
(float)total_size / GIGABYTE,
(float)workSpaceSize / GIGABYTE,
_max_seq_len);
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, workSpaceSize);
} else if (_workSpaceSize < workSpaceSize) {
cudaFree(_workspace);
cudaMalloc(&_workspace, workSpaceSize);
}
_max_seq_len,
_workspace);
if (!_workspace) {
printf("Requested:\t%lu\nFree:\t%lu\nTotal:\t%lu\n",
......@@ -202,6 +219,17 @@ public:
return stream;
}
void release_workspace()
{
cudaFree(_workspace);
_workspace = nullptr;
}
bool retake_workspace()
{
if (_workspace != nullptr || _workSpaceSize == 0) return true;
cudaMalloc(&_workspace, _workSpaceSize);
return _workspace != nullptr;
}
cublasHandle_t GetCublasHandle() { return _cublasHandle; }
std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
......
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
......@@ -105,6 +107,7 @@ int cublas_gemm_ex(cublasHandle_t handle,
return 0;
}
template <typename T>
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
......@@ -114,9 +117,9 @@ int cublas_gemm_ex(rocblas_handle handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
......@@ -127,13 +130,15 @@ int cublas_gemm_ex(cublasHandle_t handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
constexpr auto rocblas_dtype_16 = std::is_same<T, __half>::value ? rocblas_datatype_f16_r
: rocblas_datatype_bf16_r;
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
......@@ -142,23 +147,24 @@ int cublas_gemm_ex(cublasHandle_t handle,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
(void*)C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
(void*)C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
constexpr auto cublas_dtype_16 = std::is_same<T, __half>::value ? CUDA_R_16F : CUDA_R_16BF;
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
......@@ -167,14 +173,14 @@ int cublas_gemm_ex(cublasHandle_t handle,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_16F,
cublas_dtype_16,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_16F,
cublas_dtype_16,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
(void*)C,
CUDA_R_16F,
cublas_dtype_16,
m,
CUDA_R_32F,
algo);
......@@ -306,6 +312,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
return 0;
}
template <typename T>
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
......@@ -313,9 +320,9 @@ int cublas_strided_batched_gemm(rocblas_handle handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
......@@ -330,9 +337,9 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
......@@ -343,6 +350,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
#endif
{
#ifdef __HIP_PLATFORM_HCC__
constexpr auto rocblas_dtype_16 = std::is_same<T, __half>::value ? rocblas_datatype_f16_r
: rocblas_datatype_bf16_r;
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
......@@ -352,20 +361,20 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
k,
alpha,
A,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
stride_C,
C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
stride_C,
batch,
......@@ -374,6 +383,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
0,
0);
#else
constexpr auto cublas_dtype_16 = std::is_same<T, __half>::value ? CUDA_R_16F : CUDA_R_16BF;
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
......@@ -382,16 +392,16 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
k,
alpha,
A,
CUDA_R_16F,
cublas_dtype_16,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_16F,
cublas_dtype_16,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_16F,
cublas_dtype_16,
m,
stride_C,
batch,
......
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#pragma once
#include "ds_kernel_utils.h"
#include <cuda.h>
#ifdef BF16_AVAILABLE
#include <cuda_bf16.h>
#endif
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include "custom_cuda_layers.h"
......
/*
Copyright The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include <math.h>
#include "custom_cuda_layers.h"
......
/*
Copyright The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include "custom_cuda_layers.h"
......
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <torch/csrc/utils/tensor_flatten.h>
......
ref: refs/heads/ds-v0.8.2-rocm
[core]
repositoryformatversion = 0
filemode = true
bare = true
[remote "origin"]
url = http://developer.hpccube.com/codes/aicomponent/deepspeed.git
fetch = +refs/*:refs/*
mirror = true
Unnamed repository; edit this file 'description' to name the repository.
#!/bin/sh
#
# An example hook script to check the commit log message taken by
# applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit. The hook is
# allowed to edit the commit message file.
#
# To enable this hook, rename this file to "applypatch-msg".
. git-sh-setup
test -x "$GIT_DIR/hooks/commit-msg" &&
exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"}
:
#!/bin/sh
#
# An example hook script to check the commit log message.
# Called by "git commit" with one argument, the name of the file
# that has the commit message. The hook should exit with non-zero
# status after issuing an appropriate message if it wants to stop the
# commit. The hook is allowed to edit the commit message file.
#
# To enable this hook, rename this file to "commit-msg".
# Uncomment the below to add a Signed-off-by line to the message.
# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
# hook is more suited to it.
#
# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
# This example catches duplicate Signed-off-by lines.
test "" = "$(grep '^Signed-off-by: ' "$1" |
sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || {
echo >&2 Duplicate Signed-off-by lines.
exit 1
}
#!/bin/sh
#
# An example hook script to prepare a packed repository for use over
# dumb transports.
#
# To enable this hook, rename this file to "post-update".
exec git update-server-info
#!/bin/sh
#
# An example hook script to verify what is about to be committed
# by applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-applypatch".
. git-sh-setup
test -x "$GIT_DIR/hooks/pre-commit" &&
exec "$GIT_DIR/hooks/pre-commit" ${1+"$@"}
:
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git commit" with no arguments. The hook should
# exit with non-zero status after issuing an appropriate message if
# it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-commit".
if git rev-parse --verify HEAD >/dev/null 2>&1
then
against=HEAD
else
# Initial commit: diff against an empty tree object
against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
fi
# If you want to allow non-ascii filenames set this variable to true.
allownonascii=$(git config hooks.allownonascii)
# Redirect output to stderr.
exec 1>&2
# Cross platform projects tend to avoid non-ascii filenames; prevent
# them from being added to the repository. We exploit the fact that the
# printable range starts at the space character and ends with tilde.
if [ "$allownonascii" != "true" ] &&
# Note that the use of brackets around a tr range is ok here, (it's
# even required, for portability to Solaris 10's /usr/bin/tr), since
# the square bracket bytes happen to fall in the designated range.
test $(git diff --cached --name-only --diff-filter=A -z $against |
LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
then
echo "Error: Attempt to add a non-ascii file name."
echo
echo "This can cause problems if you want to work"
echo "with people on other platforms."
echo
echo "To be portable it is advisable to rename the file ..."
echo
echo "If you know what you are doing you can disable this"
echo "check using:"
echo
echo " git config hooks.allownonascii true"
echo
exit 1
fi
# If there are whitespace errors, print the offending file names and fail.
exec git diff-index --check --cached $against --
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment