Unverified Commit 981a4610 authored by Li Zhang, committed by GitHub

[Fix] Remove unused code to reduce binary size (#181)

* clean-up

* fix lint

* fix lint
parent 83697422
@@ -299,21 +299,16 @@ endif()
########################################
add_library(transformer-shared SHARED
$<TARGET_OBJECTS:BaseBeamSearchLayer>
$<TARGET_OBJECTS:BaseSamplingLayer>
$<TARGET_OBJECTS:BeamSearchLayer>
$<TARGET_OBJECTS:DynamicDecodeLayer>
$<TARGET_OBJECTS:llama_fmha>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:OnlineBeamSearchLayer>
$<TARGET_OBJECTS:TopKSamplingLayer>
$<TARGET_OBJECTS:TopPSamplingLayer>
$<TARGET_OBJECTS:TransformerTritonBackend>
$<TARGET_OBJECTS:activation_kernels>
$<TARGET_OBJECTS:ban_bad_words>
$<TARGET_OBJECTS:beam_search_penalty_kernels>
$<TARGET_OBJECTS:beam_search_topk_kernels>
$<TARGET_OBJECTS:bert_preprocess_kernels>
$<TARGET_OBJECTS:cublasAlgoMap>
$<TARGET_OBJECTS:cublasMMWrapper>
@@ -329,7 +324,6 @@ add_library(transformer-shared SHARED
$<TARGET_OBJECTS:mpi_utils>
$<TARGET_OBJECTS:nccl_utils>
$<TARGET_OBJECTS:nvtx_utils>
$<TARGET_OBJECTS:online_softmax_beamsearch_kernels>
$<TARGET_OBJECTS:sampling_penalty_kernels>
$<TARGET_OBJECTS:sampling_topk_kernels>
$<TARGET_OBJECTS:sampling_topp_kernels>
@@ -26,11 +26,6 @@ add_library(activation_kernels STATIC activation_kernels.cu)
set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gen_relative_pos_bias STATIC gen_relative_pos_bias.cu)
set_property(TARGET gen_relative_pos_bias PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gen_relative_pos_bias PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gen_relative_pos_bias PUBLIC activation_kernels)
add_library(logprob_kernels STATIC logprob_kernels.cu)
set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -51,10 +46,6 @@ add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead
set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(online_softmax_beamsearch_kernels STATIC online_softmax_beamsearch_kernels.cu)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_kernels STATIC decoding_kernels.cu)
set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -63,15 +54,6 @@ add_library(gpt_kernels STATIC gpt_kernels.cu)
set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(beam_search_penalty_kernels STATIC beam_search_penalty_kernels.cu)
set_property(TARGET beam_search_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(beam_search_penalty_kernels PRIVATE cuda_utils)
add_library(beam_search_topk_kernels STATIC beam_search_topk_kernels.cu)
set_property(TARGET beam_search_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu)
set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -306,17 +306,17 @@ void invokeGenericActivation(T* out,
const int seq_len, \
cudaStream_t stream);
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, half, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, __nv_bfloat16, __nv_bfloat16);
#endif
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, half, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, __nv_bfloat16, __nv_bfloat16);
#endif
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, float, float);
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, half, half);
// #ifdef ENABLE_BF16
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, __nv_bfloat16, __nv_bfloat16);
// #endif
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, float, float);
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, half, half);
// #ifdef ENABLE_BF16
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, __nv_bfloat16, __nv_bfloat16);
// #endif
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half);
@@ -324,335 +324,4 @@ INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half);
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, __nv_bfloat16, __nv_bfloat16);
#endif
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, half, half);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, __nv_bfloat16, __nv_bfloat16);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, __nv_bfloat16);
#endif
#undef INSTANTIATE_GENERIC_ACTIVATION
template<typename T>
__global__ void add_bias_tanh(T* out, const T* __restrict bias, int m, int n)
{
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
T val = out[id];
if (bias != nullptr) {
val = val + ldg(&bias[id % n]);
}
out[id] = tanhf(val);
}
}
template<>
__global__ void add_bias_tanh(half* out, const half* __restrict bias, int m, int n)
{
half2* out_ptr = (half2*)out;
const half2* bias_ptr = (half2*)bias;
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
half2 val = out_ptr[id];
if (bias != nullptr) {
val = val + __ldg(&bias_ptr[id % n]);
}
val.x = tanhf(val.x);
val.y = tanhf(val.y);
out_ptr[id] = val;
}
}
#ifdef ENABLE_BF16
template<>
__global__ void add_bias_tanh(__nv_bfloat16* out, const __nv_bfloat16* __restrict bias, int m, int n)
{
__nv_bfloat162* out_ptr = (__nv_bfloat162*)out;
const __nv_bfloat162* bias_ptr = (__nv_bfloat162*)bias;
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
__nv_bfloat162 val = out_ptr[id];
if (bias != nullptr) {
val = bf16hadd2(val, ldg(&bias_ptr[id % n]));
}
val.x = tanhf(val.x);
val.y = tanhf(val.y);
out_ptr[id] = val;
}
}
#endif
template<typename T>
void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream)
{
const int data_type_factor = 4 / sizeof(T); // 1 for fp32, 2 for fp16 and bf16
dim3 block, grid;
if (n / 4 / data_type_factor <= 1024) {
block.x = n / 4 / data_type_factor;
grid.x = m;
}
else {
block.x = 1024;
grid.x = ceil(m * n / 1024.);
}
add_bias_tanh<T><<<grid, block, 0, stream>>>(out, bias, m, n / data_type_factor);
}
template void invokeAddBiasTanh(float* out, const float* bias, const int m, const int n, cudaStream_t stream);
template void invokeAddBiasTanh(half* out, const half* bias, const int m, const int n, cudaStream_t stream);
#ifdef ENABLE_BF16
template void
invokeAddBiasTanh(__nv_bfloat16* out, const __nv_bfloat16* bias, const int m, const int n, cudaStream_t stream);
#endif
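// Note (assumption inferred from the launch math above, not stated in the source):
// invokeAddBiasTanh expects n to be even for fp16/bf16, since the specialized kernels
// read half2 / __nv_bfloat162 pairs, and expects n / 4 / data_type_factor to be at
// least 1 so that block.x does not become zero.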
template<typename T2, int N>
__global__ void addBiasGeluV2(T2* out,
const T2* __restrict bias,
const int* ia3_tasks,
const T2* ia3_weights,
const int size,
const int* padding_offset,
const int seq_len)
{
const bool with_ia3 = ia3_tasks != nullptr;
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < size; id += blockDim.x * gridDim.x) {
T2 val = out[id];
if (bias != nullptr) {
T2 reg_bias = ldg(&bias[id % N]);
val = hadd2(val, reg_bias);
}
val = GeluActivation<T2>::apply(val);
if (with_ia3) {
const int word_id = id / N;
const int offset = padding_offset == nullptr ? 0 : padding_offset[word_id];
const int batch_id = (word_id + offset) / seq_len;
const int task = ia3_tasks[batch_id];
val = val * ia3_weights[task * N + (id % N)];
}
out[id] = val;
}
}
template<typename T2, int N, int ELEMENT_PER_ROUND>
__global__ void addBiasGeluV3(T2* out,
const T2* __restrict bias,
const int* ia3_tasks,
const T2* ia3_weights,
const int size,
const int* padding_offset,
const int seq_len)
{
const bool with_ia3 = ia3_tasks != nullptr;
T2 buffer[ELEMENT_PER_ROUND];
T2 tmp_bias[ELEMENT_PER_ROUND];
for (int id = blockIdx.x * blockDim.x * ELEMENT_PER_ROUND + threadIdx.x * ELEMENT_PER_ROUND; id < size;
id += blockDim.x * gridDim.x * ELEMENT_PER_ROUND) {
#pragma unroll
for (int i = 0; i < ELEMENT_PER_ROUND; i++) {
buffer[i] = out[id + i];
if (bias != nullptr) {
tmp_bias[i] = ldg(&bias[(id + i) % N]);
}
}
#pragma unroll
for (int i = 0; i < ELEMENT_PER_ROUND; i++) {
if (bias != nullptr) {
buffer[i] = hadd2(buffer[i], tmp_bias[i]);
}
buffer[i] = GeluActivation<T2>::apply(buffer[i]);
if (with_ia3) {
const int word_id = (id + i) / N;
const int offset = padding_offset == nullptr ? 0 : padding_offset[word_id];
const int batch_id = (word_id + offset) / seq_len;
const int task = ia3_tasks[batch_id];
buffer[i] = buffer[i] * ia3_weights[task * N + ((id + i) % N)];
}
out[id + i] = buffer[i];
}
}
}
#define ADD_BIAS_GELU(HALF_N, ELEMENT_PER_ROUND) \
case HALF_N: \
if (ELEMENT_PER_ROUND > 1) { \
grid.x = grid.x / ELEMENT_PER_ROUND; \
addBiasGeluV3<T2, HALF_N, ELEMENT_PER_ROUND><<<grid, block, 0, stream>>>( \
(T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \
} \
else { \
addBiasGeluV2<T2, HALF_N><<<grid, block, 0, stream>>>( \
(T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \
} \
break;
template<typename T>
void invokeAddBiasGeluV2(T* out,
const T* bias,
const int* ia3_tasks,
const T* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream)
{
if (n % 2 == 0 && sizeof(T) == 2) {
const int half_n = n / 2;
dim3 block, grid;
block.x = std::min(half_n, 512);
grid.x = (m * half_n + (block.x - 1)) / block.x;
using T2 = typename TypeConverter<T>::Type;
if (grid.x >= 512) {
switch (half_n) {
ADD_BIAS_GELU(256, 1)
ADD_BIAS_GELU(512, 1)
ADD_BIAS_GELU(1024, 1)
ADD_BIAS_GELU(1536, 1)
ADD_BIAS_GELU(2048, 1)
ADD_BIAS_GELU(4096, 2)
ADD_BIAS_GELU(8192, 2)
ADD_BIAS_GELU(16384, 2)
ADD_BIAS_GELU(24576, 2)
ADD_BIAS_GELU(40960, 4)
default:
invokeGenericActivation<GeluActivation>(out,
bias,
(T*)nullptr,
(T*)nullptr,
ia3_tasks,
ia3_weights,
m,
n,
0,
(float*)nullptr,
(float*)nullptr,
padding_offset,
seq_len,
stream);
break;
}
}
else {
switch (half_n) {
ADD_BIAS_GELU(256, 1)
ADD_BIAS_GELU(512, 1)
ADD_BIAS_GELU(1024, 1)
ADD_BIAS_GELU(1536, 1)
ADD_BIAS_GELU(2048, 1)
ADD_BIAS_GELU(4096, 1)
ADD_BIAS_GELU(8192, 2)
ADD_BIAS_GELU(16384, 2)
ADD_BIAS_GELU(24576, 2)
ADD_BIAS_GELU(40960, 2)
default:
invokeGenericActivation<GeluActivation>(out,
bias,
(T*)nullptr,
(T*)nullptr,
ia3_tasks,
ia3_weights,
m,
n,
0,
(float*)nullptr,
(float*)nullptr,
padding_offset,
seq_len,
stream);
break;
}
}
}
else {
invokeGenericActivation<GeluActivation>(out,
bias,
(T*)nullptr,
(T*)nullptr,
ia3_tasks,
ia3_weights,
m,
n,
0,
(float*)nullptr,
(float*)nullptr,
padding_offset,
seq_len,
stream);
}
}
#undef ADD_BIAS_GELU
template void invokeAddBiasGeluV2(float* out,
const float* bias,
const int* ia3_tasks,
const float* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
template void invokeAddBiasGeluV2(half* out,
const half* bias,
const int* ia3_tasks,
const half* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeAddBiasGeluV2(__nv_bfloat16* out,
const __nv_bfloat16* bias,
const int* ia3_tasks,
const __nv_bfloat16* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
#endif // ENABLE_BF16
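// Illustrative dispatch example (not part of this diff): for half data with n = 8192,
// invokeAddBiasGeluV2 uses half_n = 4096 and block.x = 512; once m is large enough that
// grid.x = (m * 4096 + 511) / 512 >= 512 (e.g. m >= 64), the ADD_BIAS_GELU(4096, 2) case
// fires, halving grid.x and letting addBiasGeluV3 process two half2 elements per thread
// per round.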
template<typename T>
__global__ void sigmoid_kernel(T* data, const int size, const float scale)
{
const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (index < size) {
float val = cuda_cast<float>(data[index]);
val = 1.0f / (1.0f + exp(-val)) * scale;
data[index] = T(val);
}
}
template<>
__global__ void sigmoid_kernel(half2* data, const int size, const float scale)
{
const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (index < size / 2) {
half2 val = data[index];
float2 val_float2 = cuda_cast<float2>(val);
val_float2.x = 1.0f / (1.0f + exp(-val_float2.x)) * scale;
val_float2.y = 1.0f / (1.0f + exp(-val_float2.y)) * scale;
data[index] = cuda_cast<half2>(val_float2);
}
}
template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream)
{
if (std::is_same<T, float>::value || (size % 2 != 0)) {
dim3 block(128);
dim3 grid((size + 127) / 128);
sigmoid_kernel<<<grid, block, 0, stream>>>(data, size, scale);
}
else {
dim3 block(128);
dim3 grid((size + 255) / 256);
sigmoid_kernel<<<grid, block, 0, stream>>>((half2*)data, size, scale);
}
}
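// Illustrative example (not part of this diff): for half data with size = 1024, the
// vectorized branch is taken, launching (1024 + 255) / 256 = 4 blocks of 128 threads,
// i.e. exactly 512 threads covering the 512 half2 elements.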
template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream);
template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
namespace turbomind {
template<typename T>
__global__ void add_bias_temperature(T* logits,
const T* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
int tid = threadIdx.x;
int bid = blockIdx.x;
int bbid = blockIdx.y;
logits += bbid * vocab_size_padded;
const T MASK_VAL = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
const T inv_temp = static_cast<T>(1.0f / (temperature + 1e-6f));
for (int i = tid + bid * blockDim.x; i < vocab_size_padded; i += blockDim.x * gridDim.x) {
if (i < vocab_size) {
T bias_val = bias == nullptr ? (T)(0.0f) : bias[i];
logits[i] = (logits[i] + bias_val) * inv_temp;
}
else {
logits[i] = MASK_VAL;
}
}
}
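// Note: slots in the padded tail of the vocabulary (i >= vocab_size) are forced to the
// mask value so that downstream top-k / sampling can never select them; valid slots are
// scaled by 1 / (temperature + 1e-6f), where the epsilon guards against temperature == 0.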
template<>
__global__ void add_bias_temperature(half2* logits,
const half2* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
assert(vocab_size % 2 == 0);
assert(vocab_size_padded % 2 == 0);
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int bbid = blockIdx.y;
const half2 mask_val = __float2half2_rn(-HALF_FLT_MAX);
const half2 inv_temp = __float2half2_rn(1.0f / (temperature + 1e-6f));
const int half_vocab_size = vocab_size / 2;
const int half_vocab_size_padded = vocab_size_padded / 2;
logits += bbid * half_vocab_size_padded;
for (int index = tid + bid * blockDim.x; index < half_vocab_size_padded; index += blockDim.x * gridDim.x) {
int vocab_idx = index % half_vocab_size_padded;
half2 logit = vocab_idx < half_vocab_size ? __ldg(&logits[index]) : mask_val;
if (vocab_idx < half_vocab_size) {
if (bias != nullptr) {
logit = __hadd2(logit, bias[vocab_idx]);
}
logit = __hmul2(logit, inv_temp);
}
logits[index] = logit;
}
}
template<typename T, bool IS_ADDITIVE>
__global__ void apply_repetition_penalty(T* logits,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int step,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int max_input_length,
const float repetition_penalty)
{
assert(step > 0);
const int tid = threadIdx.x;
const int bbid = blockIdx.x;
const int batch_id = bbid / beam_width;
const int bbsize = batch_size * beam_width;
logits += bbid * vocab_size_padded;
extern __shared__ char sbuf[];
T* penalty_logits = reinterpret_cast<T*>(sbuf);
// prevent misalignment when sizeof(T) == 2
int* penalty_indices = reinterpret_cast<int*>(sbuf + (sizeof(T) * step + 31) / 32 * 32);
const int input_length = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length;
if (tid == 0) {
T repet_penalty = static_cast<T>(repetition_penalty);
int prev_id = current_ids[bbid];
T prev_logit = logits[prev_id];
penalty_indices[step - 1] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[step - 1] = prev_logit - repet_penalty;
}
else {
penalty_logits[step - 1] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
if (step > 1) {
int parent_beam = bbid % beam_width;
for (int i = step - 2; i >= 0; --i) {
// Skip the padded tokens.
if (i >= input_length && i < max_input_length) {
continue;
}
parent_beam = parent_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_id = previous_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_logit = logits[prev_id];
penalty_indices[i] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[i] = prev_logit - repet_penalty;
}
else {
penalty_logits[i] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
}
}
}
__syncthreads();
for (int i = tid; i < step; i += blockDim.x) {
if (i >= input_length && i < max_input_length) {
continue;
}
logits[penalty_indices[i]] = penalty_logits[i];
}
}
template<typename T>
__global__ void apply_min_length_penalty(T* logits,
const int min_length,
const int* end_ids,
const int* sequence_lengths,
const int max_input_length,
const int beam_width,
const int vocab_size_padded)
{
int bbid = threadIdx.x + blockIdx.x * blockDim.x; // batch-beam index
int bid = bbid / beam_width; // batch index
// We need +1 because sequence_lengths = max_input_length + num_gen_tokens - 1,
// which equals the length of the k/v caches.
if (sequence_lengths[bbid] + 1 - max_input_length < min_length) {
T mask_val = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val;
}
}
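// Worked example (illustrative): with max_input_length = 8, min_length = 3 and one token
// generated so far, sequence_lengths[bbid] = 8 + 1 - 1 = 8 per the convention above, so
// 8 + 1 - 8 = 1 < 3 and the end-token logit is masked, forcing generation to continue.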
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream)
{
if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) {
dim3 block(512);
if (std::is_same<T, half>::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) {
dim3 grid((vocab_size_padded / 2 + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(reinterpret_cast<half2*>(logits),
reinterpret_cast<const half2*>(bias),
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
temperature);
}
else {
dim3 grid((vocab_size_padded + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(
logits, bias, batch_size, beam_width, vocab_size, vocab_size_padded, temperature);
}
}
if (repetition_penalty_type != RepetitionPenaltyType::None && step > 0) {
if (repetition_penalty != getDefaultPenaltyValue(repetition_penalty_type)) {
size_t smem_size = (sizeof(T) * step + 31) / 32 * 32 + sizeof(int) * step;
dim3 block(256);
dim3 grid(beam_width * local_batch_size);
if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) {
apply_repetition_penalty<T, false>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
// TODO(jaedeokk):
// Remove (+ite ...) by getting parent_ids with offset
// and then remove 'ite' argument from the function.
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
else if (repetition_penalty_type == RepetitionPenaltyType::Additive) {
apply_repetition_penalty<T, true>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
}
}
if (step - max_input_length < min_length) {
FT_CHECK_WITH_INFO(sequence_lengths != nullptr, "Need sequence_lengths to apply min length penalty");
FT_CHECK_WITH_INFO(end_ids != nullptr, "Need end_id to apply min length penalty");
const int block_size = min(local_batch_size * beam_width, 1024);
const int grid_size = (local_batch_size * beam_width + block_size - 1) / block_size;
apply_min_length_penalty<<<grid_size, block_size, 0, stream>>>(
logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded);
}
}
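// Worked example (illustrative): for half logits and step = 100, the repetition-penalty
// launch above reserves smem_size = (2 * 100 + 31) / 32 * 32 + 4 * 100 = 224 + 400 = 624
// bytes of dynamic shared memory for the penalized logits and their indices.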
template void invokeAddBiasApplyPenalties(int step,
float* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const float* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
template void invokeAddBiasApplyPenalties(int step,
half* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const half* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_fp16.h>
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11050)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
#endif
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace turbomind {
template<typename T>
__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
{
// score = log(prob) / (length)^length_penalty.
if (length_penalty == 0.0f || length == 1) {
return log_prob;
}
return log_prob / static_cast<T>(powf((float)length, length_penalty));
}
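// Worked example (illustrative): with log_prob = -6.0, length = 4 and length_penalty = 0.5,
// the normed score is -6.0 / 4^0.5 = -3.0; with length_penalty == 0 or length == 1 the
// log prob is returned unchanged.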
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_topK_kernel(const T* log_probs,
int* topk_tmp_id_buf,
T* topk_tmp_val_buf,
const bool* finished,
const int* sequence_lengths,
const int vocab_size,
T diversity_rate,
float length_penalty)
{
typedef cub::BlockReduce<TopK<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int thread_id = threadIdx.x;
int block_id = blockIdx.x; // batch beam index.
TopK<T, MAX_K> partial;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
#pragma unroll
for (int i = 0; i < MAX_K; ++i) {
partial.p[i] = -1;
partial.u[i] = -MAX_T_VAL;
}
#pragma unroll
for (int elem_id = thread_id; elem_id < vocab_size; elem_id += THREADBLOCK_SIZE) {
int index = elem_id + block_id * vocab_size;
T score = length_penalty == 0.0f ? log_probs[index] :
apply_length_penalty(log_probs[index],
finished[block_id] ? sequence_lengths[block_id] :
sequence_lengths[block_id] + 1,
length_penalty);
partial.insert(score, index);
}
TopK<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<T, MAX_K>);
if (thread_id == 0) {
int index = block_id * MAX_K;
#pragma unroll
for (int i = 0; i < MAX_K; ++i) {
topk_tmp_id_buf[index + i] = total.p[i];
topk_tmp_val_buf[index + i] = total.u[i] + diversity_rate * (T)i;
}
}
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__
void batch_topK_kernel(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf)
{
int thread_id = threadIdx.x;
int block_id = blockIdx.x;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
TopK<T, MAX_K> partial;
if (thread_id == 0) {
for (int i = 0; i < MAX_K; ++i) {
partial.p[i] = -1;
partial.u[i] = -MAX_T_VAL;
}
int index = block_id * MAX_K * MAX_K;
for (int i = 0; i < MAX_K * MAX_K; i++) {
partial.insert((T)topk_tmp_val_buf[index + i], topk_tmp_id_buf[index + i]);
}
index = block_id * MAX_K;
for (int i = 0; i < MAX_K; i++) {
id_buf[index + i] = partial.p[i];
}
}
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__
void batch_topK_kernel_v2(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf)
{
typedef cub::BlockReduce<TopK<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int tid = threadIdx.x;
int bid = blockIdx.x;
TopK<T, MAX_K> partial;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
#pragma unroll
for (int i = 0; i < MAX_K; ++i) {
partial.p[i] = -1;
partial.u[i] = -MAX_T_VAL;
}
int ite = MAX_K * MAX_K / THREADBLOCK_SIZE;
#pragma unroll
for (int i = 0; i < ite; i++) {
int index = bid * MAX_K * MAX_K + i * THREADBLOCK_SIZE + tid;
partial.insert((T)topk_tmp_val_buf[index], topk_tmp_id_buf[index]);
}
TopK<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<T, MAX_K>);
if (tid == 0) {
#pragma unroll
for (int i = 0; i < MAX_K; i++) {
id_buf[bid * MAX_K + i] = total.p[i];
}
}
}
template<typename T, int BLOCK_SIZE_, int BLOCKS_PER_BEAM_>
__global__ void topk_stage_1_opt3(const T* __restrict log_probs,
T* tmp_log_probs,
int* topk_tmp_id_buf,
T* topk_tmp_val_buf,
const bool* finished,
const int* sequence_lengths,
const int k,
const int vocab_size,
const float length_penalty,
const int* end_ids)
{
typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE_> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int row_id = bid / BLOCKS_PER_BEAM_; // row id for log_probs (batch-beam index)
const int block_lane = bid % BLOCKS_PER_BEAM_; // block id for a beam
const int tmp_log_buf_index = row_id * vocab_size;
const int tmp_topk_buf_index = row_id * BLOCKS_PER_BEAM_ * k + block_lane * k;
TopK_2<T> partial;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
if (finished != nullptr && finished[row_id] == true) {
if (tid < k) {
const int index = tmp_topk_buf_index + tid;
if (block_lane == 0 && tid == 0) {
const int end_id = end_ids[row_id / k];
topk_tmp_id_buf[index] = tmp_log_buf_index + end_id;
topk_tmp_val_buf[index] = log_probs[tmp_log_buf_index + end_id];
}
else {
topk_tmp_id_buf[index] = -1;
topk_tmp_val_buf[index] = -MAX_T_VAL;
}
}
return;
}
for (int elem_id = tid + block_lane * BLOCK_SIZE_; elem_id < vocab_size;
elem_id += BLOCK_SIZE_ * BLOCKS_PER_BEAM_) {
int index = elem_id + tmp_log_buf_index;
tmp_log_probs[index] = log_probs[index];
}
for (int ite = 0; ite < k; ite++) {
partial.init();
#pragma unroll
for (int elem_id = tid + block_lane * BLOCK_SIZE_; elem_id < vocab_size;
elem_id += BLOCK_SIZE_ * BLOCKS_PER_BEAM_) {
int index = elem_id + tmp_log_buf_index;
partial.insert(tmp_log_probs[index], index);
}
TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);
if (tid == 0) {
const int index = tmp_topk_buf_index + ite;
topk_tmp_id_buf[index] = total.p;
topk_tmp_val_buf[index] = total.u;
tmp_log_probs[total.p] = -MAX_T_VAL;
}
__syncthreads();
}
}
template<typename T, int BLOCK_SIZE_, int BLOCKS_PER_BEAM_>
__global__ void topk_stage_2_opt3(const int* __restrict topk_tmp_id_buf,
T* topk_tmp_val_buf,
int* ids,
BeamHypotheses beam_hyps,
const int* end_ids,
const int vocab_size,
const int k)
{
const int size = k * k * BLOCKS_PER_BEAM_;
const int tid = threadIdx.x;
const int batch_id = blockIdx.x;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE_> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
extern __shared__ char array[];
T* s_val = topk_tmp_val_buf + batch_id * size;
int* s_id = (int*)(array);
__shared__ int selected_beams;
__shared__ bool is_stop;
if (tid == 0) {
selected_beams = 0;
is_stop = false;
}
__syncthreads();
if (beam_hyps.num_beams != nullptr) {
const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
if (beam_hyps.num_beams[global_batch_idx] == 0 && tid == 0) {
// initialize the buffer
beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
}
else if (beam_hyps.num_beams[global_batch_idx] == k) {
return;
}
}
TopK_2<T> partial;
// In some cases we may encounter k finished sentences whose scores are bad, so the maximum
// number of iterations is 2 * k here
for (int ite = 0; ite < 2 * k; ite++) {
partial.init();
#pragma unroll
for (int i = tid; i < size; i += BLOCK_SIZE_) {
partial.insert(s_val[i], i);
}
TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);
if (tid == 0) {
if (beam_hyps.num_beams != nullptr
&& topk_tmp_id_buf[batch_id * size + total.p] % vocab_size == end_ids[batch_id]) {
// if the beam token does not belong to the top num_beams tokens, it should not be added. See
// https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257
if (ite >= k) {
s_val[total.p] = -MAX_T_VAL;
}
else {
const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
const float normed_score =
apply_length_penalty(s_val[total.p], beam_hyps.step, beam_hyps.length_penalty);
const int num_beam = beam_hyps.num_beams[global_batch_idx];
int beam_idx = num_beam;
// If there are already beam_width finished sentences, check whether the score of the selected
// candidate is higher than min_normed_score. If the current score is better, replace the worst one
// and update min_normed_score.
if (num_beam == k) {
if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) {
// stop the trace and exit this for loop
selected_beams = k;
is_stop = true;
break;
}
else {
// find the beam whose score equals min_normed_score and erase it.
for (int j = 0; j < k; j++) {
if (beam_hyps.normed_scores[global_batch_idx * k + j]
== beam_hyps.min_normed_scores[global_batch_idx]) {
beam_idx = j;
beam_hyps.num_beams[global_batch_idx]--;
beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
beam_hyps.normed_scores[global_batch_idx * k + j] = normed_score;
for (int l = 0; l < k; l++) {
beam_hyps.min_normed_scores[global_batch_idx] =
min(beam_hyps.min_normed_scores[global_batch_idx],
beam_hyps.normed_scores[global_batch_idx * k + l]);
}
break;
}
}
}
}
const int tgt_id_offset = ((batch_id + beam_hyps.ite * beam_hyps.local_batch_size) * k + beam_idx)
* (beam_hyps.max_seq_len);
beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = end_ids[batch_id];
int prev_id = (topk_tmp_id_buf[batch_id * size + total.p] / vocab_size) % k;
for (int j = beam_hyps.step - 1; j >= 0; j--) {
const int src_idx = j * beam_hyps.batch_size * k
+ beam_hyps.ite * beam_hyps.local_batch_size * k + batch_id * k + prev_id;
beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx];
prev_id = beam_hyps.parent_ids_src[src_idx];
}
const int tgt_beam_idx = global_batch_idx * k + beam_idx;
beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = beam_hyps.step;
beam_hyps.normed_scores[tgt_beam_idx] = normed_score;
beam_hyps.min_normed_scores[global_batch_idx] =
min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]);
s_val[total.p] = -MAX_T_VAL;
beam_hyps.num_beams[global_batch_idx]++;
}
}
else {
s_id[selected_beams] = total.p;
s_val[total.p] = -MAX_T_VAL;
selected_beams++;
}
}
__syncthreads();
if (selected_beams >= k) {
break;
}
}
if (tid < k && is_stop == false) {
ids[batch_id * k + tid] = topk_tmp_id_buf[batch_id * size + s_id[tid]];
}
}
template<typename T, int BLOCK_SIZE, int BLOCKS_PER_BEAM>
__global__ void topk_stage_1_opt2_general(const T* __restrict log_probs,
T* tmp_log_probs,
int* topk_tmp_id_buf,
T* topk_tmp_val_buf,
const bool* finished,
const int* sequence_lengths,
const int k,
const int vocab_size,
const float length_penalty)
{
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int row_id = bid / BLOCKS_PER_BEAM; // row id for log_probs
const int block_lane = bid % BLOCKS_PER_BEAM; // block id for a beam
const int tmp_log_buf_index = row_id * vocab_size;
const int tmp_topk_buf_index = row_id * BLOCKS_PER_BEAM * k + block_lane * k;
TopK_2<T> partial;
for (int elem_id = tid + block_lane * BLOCK_SIZE; elem_id < vocab_size; elem_id += BLOCK_SIZE * BLOCKS_PER_BEAM) {
int index = elem_id + tmp_log_buf_index;
tmp_log_probs[index] = log_probs[index];
}
for (int ite = 0; ite < k; ite++) {
partial.init();
#pragma unroll
for (int elem_id = tid + block_lane * BLOCK_SIZE; elem_id < vocab_size;
elem_id += BLOCK_SIZE * BLOCKS_PER_BEAM) {
int index = elem_id + tmp_log_buf_index;
partial.insert(tmp_log_probs[index], index);
}
TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);
if (tid == 0) {
const int index = tmp_topk_buf_index + ite;
topk_tmp_id_buf[index] = total.p;
topk_tmp_val_buf[index] = total.u;
tmp_log_probs[total.p] = -MAX_T_VAL;
}
__syncthreads();
}
}
template<typename T, int BLOCK_SIZE, int BLOCKS_PER_BEAM>
__global__ void topk_stage_2_opt2_general(const int* __restrict topk_tmp_id_buf,
T* topk_tmp_val_buf,
int* ids,
BeamHypotheses beam_hyps,
const int* end_ids,
const int k,
const int vocab_size)
{
const int size = k * k * BLOCKS_PER_BEAM;
const int tid = threadIdx.x;
const int batch_id = blockIdx.x;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
extern __shared__ char array[];
T* s_val = topk_tmp_val_buf + batch_id * size;
int* s_id = (int*)(array);
__shared__ int selected_beams;
__shared__ bool is_stop;
if (tid == 0) {
selected_beams = 0;
is_stop = false;
}
__syncthreads();
if (beam_hyps.num_beams != nullptr) {
const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
if (beam_hyps.num_beams[global_batch_idx] == 0 && tid == 0) {
beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
}
else if (beam_hyps.num_beams[global_batch_idx] == k) {
return;
}
}
TopK_2<T> partial;
// In some cases we may encounter k finished sentences whose scores are bad, so the maximum
// number of iterations is 2 * k here
for (int ite = 0; ite < 2 * k; ite++) {
partial.init();
#pragma unroll
for (int i = tid; i < size; i += BLOCK_SIZE) {
partial.insert(s_val[i], i);
}
TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);
if (tid == 0) {
if (beam_hyps.num_beams != nullptr
&& topk_tmp_id_buf[batch_id * size + total.p] % vocab_size == end_ids[batch_id]) {
// if the beam token does not belong to the top num_beams tokens, it should not be added. See
// https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257
if (ite >= k) {
s_val[total.p] = -MAX_T_VAL;
}
else {
const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
const float normed_score =
apply_length_penalty(s_val[total.p], beam_hyps.step, beam_hyps.length_penalty);
const int num_beam = beam_hyps.num_beams[global_batch_idx];
int beam_idx = num_beam;
// If there are already beam_width finished sentences, check whether the score of the selected
// candidate is higher than min_normed_score. If the current score is better, replace the worst one
// and update min_normed_score.
if (num_beam == k) {
if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) {
// stop the trace and exit this for loop
selected_beams = k;
is_stop = true;
break;
}
else {
// find the beam whose score equals min_normed_score and erase it.
for (int j = 0; j < k; j++) {
if (beam_hyps.normed_scores[global_batch_idx * k + j]
== beam_hyps.min_normed_scores[global_batch_idx]) {
beam_idx = j;
beam_hyps.num_beams[global_batch_idx]--;
beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
beam_hyps.normed_scores[global_batch_idx * k + j] = normed_score;
for (int l = 0; l < k; l++) {
beam_hyps.min_normed_scores[global_batch_idx] =
min(beam_hyps.min_normed_scores[global_batch_idx],
beam_hyps.normed_scores[global_batch_idx * k + l]);
}
break;
}
}
}
}
const int tgt_id_offset = ((batch_id + beam_hyps.ite * beam_hyps.local_batch_size) * k + beam_idx)
* (beam_hyps.max_seq_len);
beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = end_ids[batch_id];
int prev_id = (topk_tmp_id_buf[batch_id * size + total.p] / vocab_size) % k;
for (int j = beam_hyps.step - 1; j >= 0; j--) {
const int src_idx = j * beam_hyps.batch_size * k
+ beam_hyps.ite * beam_hyps.local_batch_size * k + batch_id * k + prev_id;
beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx];
prev_id = beam_hyps.parent_ids_src[src_idx];
}
const int tgt_beam_idx = global_batch_idx * k + beam_idx;
beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = beam_hyps.step;
beam_hyps.normed_scores[tgt_beam_idx] = normed_score;
beam_hyps.min_normed_scores[global_batch_idx] =
min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]);
s_val[total.p] = -MAX_T_VAL;
beam_hyps.num_beams[global_batch_idx]++;
}
}
else {
s_id[selected_beams] = total.p;
s_val[total.p] = -MAX_T_VAL;
selected_beams++;
}
}
__syncthreads();
if (selected_beams >= k) {
break;
}
}
if (tid < k && is_stop == false) {
ids[batch_id * k + tid] = topk_tmp_id_buf[batch_id * size + s_id[tid]];
}
}
#define CASE_K_DIV(K, BLOCK_SIZE_1, BLOCK_SIZE_2) \
case K: \
beam_topK_kernel<T, K, BLOCK_SIZE_2><<<batch_size * beam_width, BLOCK_SIZE_2, 0, stream>>>(log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
finished, \
sequence_lengths, \
vocab_size, \
diversity_rate, \
length_penalty); \
if (K < 10) \
batch_topK_kernel<T, K, BLOCK_SIZE_1> \
<<<batch_size, BLOCK_SIZE_1, 0, stream>>>(topk_tmp_id_buf, topk_tmp_val_buf, ids); \
else \
batch_topK_kernel_v2<T, K, 32><<<batch_size, 32, 0, stream>>>(topk_tmp_id_buf, topk_tmp_val_buf, ids); \
break;
#define CASE_K(K, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
case K: \
topk_stage_1_opt3<float, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
<<<batch_size * K * BLOCKS_PER_BEAM_, BLOCK_SIZE_1_, 0, stream>>>(log_probs, \
temp_log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
finished, \
sequence_lengths, \
beam_width, \
vocab_size, \
length_penalty, \
end_ids); \
topk_stage_2_opt3<float, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_> \
<<<batch_size, BLOCK_SIZE_2_, K * sizeof(int), stream>>>( \
topk_tmp_id_buf, topk_tmp_val_buf, ids, *beam_hyps, end_ids, vocab_size, beam_width); \
sync_check_cuda_error(); \
break;
template<typename T>
void invokeTopkBeamSearch(void* workspace,
size_t& workspace_size,
T* log_probs,
int* ids,
BeamHypotheses* beam_hyps,
const bool* finished,
const int* sequence_lengths,
const int batch_size,
const int beam_width,
const int vocab_size_padded_,
const T diversity_rate,
const float length_penalty,
const int* end_ids,
cudaStream_t stream)
{
TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
// log_probs: (batch, beam, vocab) cumulative log_probs of beams ending with a token.
const int vocab_size = vocab_size_padded_;
// Beam size should be less than or equal to vocab size.
assert(beam_width <= vocab_size);
// Beam search needs the sequence lengths of beams to apply length penalty.
assert(length_penalty == 0.0f || sequence_lengths != nullptr);
const int max_block_per_beam = 8;
int temp_log_probs_buf_size = batch_size * beam_width * vocab_size; // type float
int topk_tmp_ids_buf_size = batch_size * beam_width * beam_width * max_block_per_beam; // type int
int topk_tmp_val_buf_size = batch_size * beam_width * beam_width * max_block_per_beam; // type float
// prevent misaligned memory addresses
temp_log_probs_buf_size = (int)(ceil(temp_log_probs_buf_size / 4.)) * 4;
topk_tmp_ids_buf_size = (int)(ceil(topk_tmp_ids_buf_size / 4.)) * 4;
topk_tmp_val_buf_size = (int)(ceil(topk_tmp_val_buf_size / 4.)) * 4;
if (workspace == nullptr) {
workspace_size = sizeof(float) * temp_log_probs_buf_size + sizeof(int) * topk_tmp_ids_buf_size
+ sizeof(float) * topk_tmp_val_buf_size;
return;
}
else {
T* temp_log_probs = (T*)workspace;
int* topk_tmp_id_buf = (int*)(temp_log_probs + temp_log_probs_buf_size);
T* topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size);
if (diversity_rate == 0.0f) {
switch (beam_width) {
CASE_K(1, 128, 128, 8);
CASE_K(4, 128, 128, 8);
CASE_K(10, 128, 128, 8);
CASE_K(16, 128, 128, 5);
CASE_K(32, 256, 128, 1);
CASE_K(64, 256, 256, 1);
default:
topk_stage_1_opt2_general<T, 128, 1>
<<<batch_size * beam_width * 1, 128, 0, stream>>>(log_probs,
temp_log_probs,
topk_tmp_id_buf,
topk_tmp_val_buf,
finished,
sequence_lengths,
beam_width,
vocab_size,
length_penalty);
topk_stage_2_opt2_general<T, 128, 1>
<<<batch_size,
128,
beam_width * beam_width * 1 * sizeof(float) + beam_width * sizeof(int),
stream>>>(
topk_tmp_id_buf, topk_tmp_val_buf, ids, *beam_hyps, end_ids, beam_width, vocab_size);
break;
}
}
else {
switch (beam_width) {
CASE_K_DIV(1, 256, 256);
CASE_K_DIV(4, 256, 256);
CASE_K_DIV(16, 256, 64);
CASE_K_DIV(32, 256, 64);
CASE_K_DIV(64, 256, 64);
default:
FT_CHECK_WITH_INFO(false, fmtstr("Topk kernel does not support beamwidth = %d \n", beam_width));
break;
}
}
return;
}
}
#undef CASE_K
#undef CASE_K_DIV
template void invokeTopkBeamSearch(void* workspace,
size_t& workspace_size,
float* log_probs,
int* ids,
BeamHypotheses* beam_hyps,
const bool* finished,
const int* sequence_lengths,
const int batch_size,
const int beam_width,
const int vocab_size_padded_,
const float diversity_rate,
const float length_penalty,
const int* end_ids,
cudaStream_t stream);
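// Minimal usage sketch (illustrative, not part of this commit; buffer names are
// hypothetical): the function is driven in two phases: a size query with a null
// workspace, then the real launch.
//
//   size_t ws_size = 0;
//   invokeTopkBeamSearch<float>(nullptr, ws_size, log_probs, ids, &beam_hyps, finished,
//                               sequence_lengths, batch_size, beam_width, vocab_size_padded,
//                               0.f, length_penalty, end_ids, stream);
//   void* ws = nullptr;
//   cudaMalloc(&ws, ws_size);
//   invokeTopkBeamSearch<float>(ws, ws_size, log_probs, ids, &beam_hyps, finished,
//                               sequence_lengths, batch_size, beam_width, vocab_size_padded,
//                               0.f, length_penalty, end_ids, stream);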
template<typename T>
__global__ void tileEncoderResults(T* tiled_output,
int* tiled_sequence_length,
const T* output,
const int* sequence_length,
const uint batch_size,
const uint beam_width,
const uint d_model)
{
if (blockIdx.x == 0) {
for (uint i = threadIdx.x; i < batch_size * beam_width; i += blockDim.x) {
tiled_sequence_length[i] = sequence_length[i / beam_width];
}
}
int tgt_offset =
blockIdx.x * gridDim.y * gridDim.z * d_model + blockIdx.y * gridDim.z * d_model + blockIdx.z * d_model;
int src_offset = blockIdx.x * gridDim.z * d_model + blockIdx.z * d_model;
for (uint i = threadIdx.x; i < d_model; i += blockDim.x) {
tiled_output[i + tgt_offset] = output[i + src_offset];
}
}
template<typename T>
void invokeTileEncoderResults(T* tiled_output,
int* tiled_sequence_length,
const T* output,
const int* sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream)
{
// tiled_output: [batch_size, beam_width, mem_max_seq_len, d_model]
// tiled_sequence_length: [batch_size, beam_width]
// output: [batch_size, mem_max_seq_len, d_model]
// sequence_length [batch_size]
dim3 grid(batch_size, beam_width, mem_max_seq_len);
bool is_half2 = (std::is_same<T, half>::value) && (d_model % 2 == 0);
if (is_half2) {
using T2 = typename TypeConverter<T>::Type; // fp16 to half2, bf16 to bf162
dim3 block(min(512, (int)(d_model / 2)));
tileEncoderResults<T2><<<grid, block, 0, stream>>>((T2*)tiled_output,
tiled_sequence_length,
(const T2*)output,
sequence_length,
batch_size,
beam_width,
d_model / 2);
}
else {
dim3 block(min(512, (int)d_model));
tileEncoderResults<T><<<grid, block, 0, stream>>>(
tiled_output, tiled_sequence_length, output, sequence_length, batch_size, beam_width, d_model);
}
}
template void invokeTileEncoderResults(float* tiled_output,
int* tiled_sequence_length,
const float* output,
const int* sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
template void invokeTileEncoderResults(half* tiled_output,
int* tiled_sequence_length,
const half* output,
const int* sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
template void invokeTileEncoderResults(half2* tiled_output,
int* tiled_sequence_length,
const half2* output,
const int* sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeTileEncoderResults(__nv_bfloat16* tiled_output,
int* tiled_sequence_length,
const __nv_bfloat16* output,
const int* sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
#endif
__global__ void insertUnfinishedPath(BeamHypotheses beam_hyps,
const bool* finished,
const float* cum_log_probs,
const int batch_size,
const int beam_width)
{
const int bid = blockIdx.x;
const int tgt_start_idx = beam_hyps.num_beams[bid];
if (beam_hyps.is_done[bid]) {
return;
}
for (int i = 0; i < beam_width; i++) {
if (threadIdx.x == 0) {
const int src_beam_idx = bid * beam_width + i;
const int tgt_beam_idx = bid * beam_width * 2 + i + tgt_start_idx;
const int length = beam_hyps.sequence_lengths_src[src_beam_idx];
beam_hyps.output_ids_tgt[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] =
beam_hyps.output_ids_src[length * batch_size * beam_width + src_beam_idx];
if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) {
beam_hyps.log_probs[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] =
beam_hyps.log_probs_src[length * batch_size * beam_width + src_beam_idx];
}
int prev_id = beam_hyps.parent_ids_src[length * batch_size * beam_width + src_beam_idx];
for (int j = length - 1; j >= 0; j--) {
// output_ids_tgt needs to use max_seq_len + 1 because its shape is
// [bs, beam_width, max_seq_len + 1]
beam_hyps.output_ids_tgt[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + j] =
beam_hyps.output_ids_src[j * batch_size * beam_width + bid * beam_width + prev_id];
if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) {
beam_hyps.log_probs[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + j] =
beam_hyps.log_probs_src[j * batch_size * beam_width + bid * beam_width + prev_id];
}
prev_id = beam_hyps.parent_ids_src[j * batch_size * beam_width + bid * beam_width + prev_id];
}
beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = length;
beam_hyps.normed_scores[tgt_beam_idx] = apply_length_penalty(
cum_log_probs[src_beam_idx], finished[src_beam_idx] ? length + 1 : length, beam_hyps.length_penalty);
beam_hyps.cum_log_probs[tgt_beam_idx] = cum_log_probs[src_beam_idx];
beam_hyps.num_beams[bid]++;
}
}
}
void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const bool* finished,
const float* cum_log_probs,
const int batch_size,
const int beam_width,
cudaStream_t stream)
{
insertUnfinishedPath<<<batch_size, 256, 0, stream>>>(beam_hyps, finished, cum_log_probs, batch_size, beam_width);
}
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#pragma once
namespace turbomind {
// In the original beam search implementation, if a beam is finished, we mark it as finished
// and only continue beam search on the remaining beams (namely, beam_width - 1 beams in the next step).
//
// In this implementation, when a beam is finished, we trace its path and record it in output_ids_tgt,
// and also record the normalized score. The beam search then continues to use `beam_width` beams in the
// next step.
//
// After we collect `beam_width` beams, we sort them by their normed_scores.
struct BeamHypotheses {
int* output_ids_tgt = nullptr;
int* sequence_lengths_tgt = nullptr;
float* cum_log_probs = nullptr; // cumulative log probs
float* normed_scores = nullptr; // cum_log_probs / (length ** length_penalty)
float* log_probs = nullptr; // log probs of each generated token
float* min_normed_scores = nullptr; // record the min normed scores for each batch
int* num_beams = nullptr; // the number of finished beams we collect
bool* is_done = nullptr;
// Used to set inputs
const int* output_ids_src;
const int* parent_ids_src;
const int* sequence_lengths_src;
const int* end_ids;
const float* log_probs_src;
// some variables for kernels
int step;
int ite;
int batch_size;
int local_batch_size;
int max_seq_len;
float length_penalty;
bool early_stopping = true;
bool is_return_normed_score = true; // return normed_cum_log_probs or cum_log_probs
};
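// Illustrative wiring sketch (hypothetical buffer names, not part of this header): the
// decoding layer is expected to point the *_src members at its running buffers and fill
// the loop metadata before each invokeTopkBeamSearch call, e.g.
//
//   BeamHypotheses hyps{};
//   hyps.output_ids_src       = output_ids_buf;
//   hyps.parent_ids_src       = parent_ids_buf;
//   hyps.sequence_lengths_src = sequence_lengths_buf;
//   hyps.end_ids              = end_ids_buf;
//   hyps.step                 = step;
//   hyps.ite                  = ite;
//   hyps.batch_size           = batch_size;
//   hyps.local_batch_size     = local_batch_size;
//   hyps.max_seq_len          = max_seq_len;
//   hyps.length_penalty       = length_penalty;
//
// with the *_tgt pointers and score buffers referring to per-request output storage.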
template<typename T>
void invokeTopkBeamSearch(void* workspace,
size_t& workspace_size,
T* log_probs,
int* ids,
BeamHypotheses* beam_hyps,
const bool* finished,
const int* sequence_lengths,
const int batch_size,
const int beam_width,
const int vocab_size_padded_,
const T diversity_rate,
const float length_penalty,
const int* end_ids,
cudaStream_t stream);
template<typename T>
void invokeTileEncoderResults(T* tiled_encoder_output,
int* tiled_encoder_sequence_length,
const T* encoder_output,
const int* encoder_sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const bool* finished,
const float* cum_log_probs,
const int batch_size,
const int beam_width,
cudaStream_t stream);
} // namespace turbomind
@@ -68,120 +68,6 @@ void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
sync_check_cuda_error();
}
template<typename T>
__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* sequence_lengths, const int max_seq_len)
{
// sequence_lengths: [batch_size]
// attention_mask: [batch_size, 1, max_seq_len, max_seq_len]
attention_mask += blockIdx.x * max_seq_len * max_seq_len;
const int length = sequence_lengths[blockIdx.x];
for (int i = threadIdx.x; i < max_seq_len * max_seq_len; i += blockDim.x) {
// int row_id = i / max_seq_len;
int col_id = i % max_seq_len;
// if (row_id < length && col_id < length) {
// TODO (bhsueh) check whether this modification is OK on other models
if (col_id < length) {
attention_mask[i] = (T)(1.0f);
}
else {
attention_mask[i] = (T)(0.0f);
}
}
}
template<typename T>
void invokeBuildEncoderAttentionMask(
T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream)
{
buildEncoderAttentionMaskKernel<<<batch_size, 256, 0, stream>>>(attention_mask, sequence_lengths, max_seq_len);
}
template void invokeBuildEncoderAttentionMask(float* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
template void invokeBuildEncoderAttentionMask(half* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
#ifdef ENABLE_FP8
template void invokeBuildEncoderAttentionMask(__nv_fp8_e4m3* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeBuildEncoderAttentionMask(__nv_bfloat16* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
#endif
__global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int* sequence_length, const int batch_size)
{
// used to get the TensorRT fused MHA padding offset
// when the padding is removed
extern __shared__ int tmp_offset[];
if (threadIdx.x == 0) {
tmp_offset[0] = 0;
for (int i = 0; i < batch_size; i++) {
tmp_offset[i + 1] = tmp_offset[i] + sequence_length[i];
}
}
__syncthreads();
for (int i = threadIdx.x; i < batch_size + 1; i += blockDim.x) {
trt_mha_padding_offset[i] = tmp_offset[i];
}
}
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int batch_size,
cudaStream_t stream)
{
getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (batch_size + 1), stream>>>(
trt_mha_padding_offset, sequence_length, batch_size);
}
__global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len)
{
// used to get the TensorRT fused MHA padding offset
// when the padding is kept
extern __shared__ int tmp_offset[];
if (threadIdx.x == 0) {
tmp_offset[0] = 0;
for (int i = 0; i < request_batch_size; i++) {
tmp_offset[i * 2 + 1] = tmp_offset[i * 2] + sequence_length[i];
tmp_offset[i * 2 + 2] = request_seq_len * (i + 1);
}
}
__syncthreads();
for (int i = threadIdx.x; i < 2 * request_batch_size + 1; i += blockDim.x) {
trt_mha_padding_offset[i] = tmp_offset[i];
}
}
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len,
cudaStream_t stream)
{
getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (2 * request_batch_size + 1), stream>>>(
trt_mha_padding_offset, sequence_length, request_batch_size, request_seq_len);
}
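// Worked example (illustrative): for sequence_length = {3, 5} and request_batch_size = 2,
// the padding-removed variant above produces trt_mha_padding_offset = {0, 3, 8}; the
// padding-kept variant with request_seq_len = 6 produces {0, 3, 6, 11, 12}.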
template<typename T>
__global__ void rebuild_sequence_length_padding(const T* src, T* dst, const int* padding_offset, const int n)
{
......@@ -287,183 +173,4 @@ template void invokeRemovePadding(__nv_bfloat16* dst,
cudaStream_t stream);
#endif
template<typename T>
__global__ void buildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance)
{
const int head_id = blockIdx.x;
for (int seq_id = threadIdx.x; seq_id < seq_len * seq_len; seq_id += blockDim.x) {
int row_id = seq_id / seq_len;
int col_id = seq_id % seq_len;
int relative_position = col_id - row_id;
int relative_buckets = 0;
int tmp_num_bucket = num_bucket;
if (is_bidirectional) {
tmp_num_bucket /= 2;
if (relative_position > 0) {
relative_buckets += tmp_num_bucket;
}
else {
relative_position *= -1;
}
}
else {
relative_position = abs(relative_position);
}
int max_exact = tmp_num_bucket / 2;
bool is_small = relative_position < max_exact;
int relative_position_if_large =
max_exact
+ (int)(logf(relative_position * 1.0f / max_exact) / logf((float)max_distance / max_exact)
* (tmp_num_bucket - max_exact));
relative_position_if_large = min(relative_position_if_large, tmp_num_bucket - 1);
relative_buckets += is_small ? relative_position : relative_position_if_large;
relative_attention_bias[head_id * seq_len * seq_len + seq_id] =
relative_attention_bias_table[head_id * num_bucket + relative_buckets];
}
}
template<typename T>
void invokeBuildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream)
{
if (position_embedding_type == PositionEmbeddingType::absolute) {
return;
}
dim3 grid(head_num);
dim3 block(256);
buildRelativeAttentionBias<<<grid, block, 0, stream>>>(relative_attention_bias,
relative_attention_bias_table,
head_num,
seq_len,
num_bucket,
is_bidirectional,
max_distance);
}
template void invokeBuildRelativeAttentionBias(float* relative_attention_bias,
const float* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
template void invokeBuildRelativeAttentionBias(half* relative_attention_bias,
const half* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeBuildRelativeAttentionBias(__nv_bfloat16* relative_attention_bias,
const __nv_bfloat16* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
#endif
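// Worked example of the bucketing above (values chosen for illustration):
// is_bidirectional = true, num_bucket = 32, max_distance = 128, relative_position = +20.
//   tmp_num_bucket = 16, and since the position is positive, relative_buckets starts at 16.
//   max_exact = 8 and 20 >= max_exact, so the logarithmic branch is used:
//     8 + (int)(log(20 / 8) / log(128 / 8) * 8) = 8 + (int)2.64 = 10
//   giving bucket 16 + 10 = 26, i.e. the bias read is
//   relative_attention_bias_table[head_id * 32 + 26].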
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN>
__global__ void getLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
param.output[blockIdx.x * param.d_model + threadIdx.x] = (T_OUT)(
(float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x] * __ldg(param.input_scale));
}
template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
FT_CHECK(param.d_model <= 1024);
getLastTokenDequantize<T_OUT, T_IN><<<param.batch_size, param.d_model, 0, param.stream>>>(param);
}
template void invokeGetLastTokenDequantize<__nv_bfloat16, __nv_fp8_e4m3>(
getLastTokenDequantizeParam<__nv_bfloat16, __nv_fp8_e4m3> param);
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
__global__ void quantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param)
{
for (int i = threadIdx.x; i < param.d_model; i += blockDim.x) {
int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 0 : param.padding_offset[blockIdx.x]);
if (quantize_mode == QUANTIZE_MODE::PER_TENSOR) {
param.dst[padded_row_id * param.d_model + i] =
(T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale));
}
else if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) {
param.dst[padded_row_id * param.d_model + i] =
(T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale + i));
}
}
}
template<>
__global__ void
quantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR> param)
{
int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 0 : __ldg(&param.padding_offset[blockIdx.x]));
__nv_fp8x4_e4m3* src_ptr = ((__nv_fp8x4_e4m3*)param.src) + blockIdx.x * (param.d_model / 4);
half2* dst_ptr = ((half2*)param.dst) + padded_row_id * (param.d_model / 2);
half2 scale = cuda_cast<half2>(__ldg(param.scale));
for (int i = threadIdx.x; i < param.d_model / 4; i += blockDim.x) {
half2 val_0;
half2 val_1;
fp8x4_e4m3_to_half2(&val_0, &val_1, src_ptr + i);
val_0 = hmul2(val_0, scale);
val_1 = hmul2(val_1, scale);
dst_ptr[2 * i + 0] = val_0;
dst_ptr[2 * i + 1] = val_1;
}
}
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param)
{
dim3 grid(param.token_num);
dim3 block(param.d_model);
FT_CHECK(block.x <= 1024);
if (block.x % 4 == 0) {
block.x /= 4;
}
quantizeMatrixRebuildPadding<<<grid, block, 0, param.stream>>>(param);
}
template void invokeQuantizeMatrixRebuildPadding<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR>(
QuantizeMatrixRebuildPaddingParam<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR> param);
#endif
} // namespace turbomind
......@@ -15,7 +15,6 @@
*/
#pragma once
#include "src/turbomind/kernels/gen_relative_pos_bias.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
......@@ -46,21 +45,6 @@ inline void invokeGetPaddingOffset(size_t* h_pinned_token_num,
h_pinned_token_num, h_token_num, tmp_mask_offset, nullptr, sequence_length, batch_size, max_seq_len, stream);
}
template<typename T>
void invokeBuildEncoderAttentionMask(
T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len,
cudaStream_t stream);
template<typename T>
void invokeRebuildPadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
......@@ -69,46 +53,4 @@ template<typename T>
void invokeRemovePadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
template<typename T>
void invokeBuildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
template<typename T_OUT, typename T_IN>
struct getLastTokenDequantizeParam {
T_OUT* const output;
T_IN const* const input;
float const* const input_scale;
const int batch_size;
const int max_seq_len;
const int d_model;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param);
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
struct QuantizeMatrixRebuildPaddingParam {
T_OUT* dst;
const T_IN* src;
const int* padding_offset;
const int token_num;
const int d_model;
const float* scale;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif // ENABLE_FP8
} // namespace turbomind
......@@ -16,7 +16,6 @@
#pragma once
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include <cuda_fp16.h>
......
......@@ -21,81 +21,6 @@
namespace turbomind {
// static const float HALF_FLT_MAX = 65504.F;
template<typename T>
__global__ void decodingInitialize(bool* finished,
int* sequence_length,
int* word_ids,
T* cum_log_probs,
const int* sentence_ids,
const int batch_size,
const int beam_width,
const int max_input_length)
{
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? (T)HALF_FLT_MAX : (T)1e20f; // BF16 and FP32 have the same dynamic range
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
index += blockDim.x * gridDim.x) {
finished[index] = false;
sequence_length[index] = max_input_length;
if (word_ids != nullptr) {
word_ids[index] = sentence_ids[index / beam_width];
}
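// Only the first beam of each batch item starts with log prob 0; the other beams start
// at -MAX_T_VAL so that the first beam-search expansion is driven by beam 0 alone.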
cum_log_probs[index] = (index % beam_width == 0) ? (T)0.0f : (T)-MAX_T_VAL;
}
}
template<typename T>
void invokeDecodingInitialize(bool* finished,
int* sequence_length,
int* word_ids,
T* cum_log_probs,
const int* sentence_ids,
const int batch_size,
const int beam_width,
const int max_input_length,
cudaStream_t stream)
{
dim3 grid((int)ceil(batch_size * beam_width * 1.0 / 256));
dim3 block(256);
decodingInitialize<T><<<grid, block, 0, stream>>>(
finished, sequence_length, word_ids, cum_log_probs, sentence_ids, batch_size, beam_width, max_input_length);
}
template void invokeDecodingInitialize(bool* finished,
int* sequence_length,
int* word_ids,
float* cum_log_probs,
const int* sentence_ids,
const int batch_size,
const int beam_width,
const int max_input_length,
cudaStream_t stream);
template void invokeDecodingInitialize(bool* finished,
int* sequence_length,
int* word_ids,
half* cum_log_probs,
const int* sentence_ids,
const int batch_size,
const int beam_width,
const int max_input_length,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeDecodingInitialize(bool* finished,
int* sequence_length,
int* word_ids,
__nv_bfloat16* cum_log_probs,
const int* sentence_ids,
const int batch_size,
const int beam_width,
const int max_input_length,
cudaStream_t stream);
#endif
// PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts
template<typename T>
__global__ void embeddingLookupPosEncoding(T* from_tensor,
......@@ -364,33 +289,33 @@ void invokePaddingEmbedding(T* padded_embedding_kernel,
vocab_size_padded);
}
template void invokePaddingEmbedding(float* padded_embedding_kernel,
float* padded_embedding_bias,
const float* embedding_kernel,
const float* embedding_bias,
const int hidden_unit,
const int vocab_size,
const int vocab_size_padded,
cudaStream_t stream);
template void invokePaddingEmbedding(half* padded_embedding_kernel,
half* padded_embedding_bias,
const half* embedding_kernel,
const half* embedding_bias,
const int hidden_unit,
const int vocab_size,
const int vocab_size_padded,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokePaddingEmbedding(__nv_bfloat16* padded_embedding_kernel,
__nv_bfloat16* padded_embedding_bias,
const __nv_bfloat16* embedding_kernel,
const __nv_bfloat16* embedding_bias,
const int hidden_unit,
const int vocab_size,
const int vocab_size_padded,
cudaStream_t stream);
#endif
// template void invokePaddingEmbedding(float* padded_embedding_kernel,
// float* padded_embedding_bias,
// const float* embedding_kernel,
// const float* embedding_bias,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// template void invokePaddingEmbedding(half* padded_embedding_kernel,
// half* padded_embedding_bias,
// const half* embedding_kernel,
// const half* embedding_bias,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// #ifdef ENABLE_BF16
// template void invokePaddingEmbedding(__nv_bfloat16* padded_embedding_kernel,
// __nv_bfloat16* padded_embedding_bias,
// const __nv_bfloat16* embedding_kernel,
// const __nv_bfloat16* embedding_bias,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// #endif
template<typename T>
__global__ void paddingEmbeddingKernel(T* padded_embedding_kernel,
......@@ -426,256 +351,28 @@ void invokePaddingEmbeddingKernel(T* padded_embedding_kernel,
padded_embedding_kernel, embedding_kernel, hidden_unit, vocab_size, vocab_size_padded);
}
template void invokePaddingEmbeddingKernel(float* padded_embedding_kernel,
const float* embedding_kernel,
const int hidden_unit,
const int vocab_size,
const int vocab_size_padded,
cudaStream_t stream);
template void invokePaddingEmbeddingKernel(half* padded_embedding_kernel,
const half* embedding_kernel,
const int hidden_unit,
const int vocab_size,
const int vocab_size_padded,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokePaddingEmbeddingKernel(__nv_bfloat16* padded_embedding_kernel,
const __nv_bfloat16* embedding_kernel,
const int hidden_unit,
const int vocab_size,
const int vocab_size_padded,
cudaStream_t stream);
#endif
__global__ void gatherTree(gatherTreeParam param)
{
// PREFIX SOFT PROMPT
// beam: has six parts
// [prompt | input | input_padding | prompt_padding | generated output | padding (use end_token)]
// parents: has six parts
// [prompt | input | input_padding | prompt_padding | generated output | padding (use 0)]
// step_ids: need to remove prompt, input_padding and prompt_padding
// the shape is [input_length + requested_output_length, bs, beam_width]
// need to transpose to output_ids [bs, beam_width, input_length + requested_output_length]
// max_input_length: input + input_padding + prompt_padding
// P/PROMPT TUNING
// NOTE: input (real ids | prompt virtual ids) have already been preprocessed during embedding lookup,
//       so there are no prompt templates at this point
// beam:    [input (real ids | prompt virtual ids) | input_padding | generated output | padding (use end_token)]
// parents: [input (real ids | prompt virtual ids) | input_padding | generated output | padding (use 0)]
// step_ids: need to remove virtual prompt ids in input ids
// the shape is [input_length (real input length, prompt length) + requested_output_length, bs, beam_width]
// need to transpose to output_ids [bs, beam_width, input_length + requested_output_length]
// max_input_length: input (real ids | prompt virtual ids) + input_padding
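// Example shapes (illustrative values, not from the original source): with batch_size = 2,
// beam_width = 3, input_length = 6 and requested_output_length = 4,
//   step_ids / beams are indexed as [input_length + requested_output_length, bs, beam_width] = [10, 2, 3]
//   and are transposed below into output_ids indexed as [2, 3, actual_output_length].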
const int max_input_length = param.input_lengths == nullptr ? 0 : param.max_input_length;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < param.batch_size * param.beam_width;
i += gridDim.x * blockDim.x) {
const int batch = i / param.beam_width;
const int beam = i % param.beam_width;
const int prompt_len =
param.prefix_soft_prompt_lengths == nullptr ? 0 : param.prefix_soft_prompt_lengths[batch];
int input_len = param.input_lengths == nullptr ? 0 : param.input_lengths[i];
// virtual prompts mean the prompt embedded in input ids (with prompt templates) [p/prompt tuning]
const int virtual_prompt_length =
param.p_prompt_tuning_prompt_lengths == nullptr ? 0 : param.p_prompt_tuning_prompt_lengths[batch];
// real input length (without virtual prompts) [p/prompt tuning]
input_len -= virtual_prompt_length;
const int* parent_ids = param.parent_ids;
const int* step_ids = param.step_ids;
// TODO(bhsueh) optimize the reduce_max operation for large beam_width
int max_len = -1;
bool update_response_input_length = param.response_input_lengths != nullptr;
// int selected_beam_index = 0;
for (int j = 0; j < param.beam_width; j++) {
int tmp_len =
param.max_sequence_lengths[batch * param.beam_width + j] + param.max_sequence_length_final_step;
// also remove the length of the soft prompts, p_prompt_tuning
param.max_sequence_lengths[batch * param.beam_width + j] =
tmp_len - param.max_prefix_soft_prompt_length
- (param.max_input_length - param.max_input_without_prompt_length);
// update the response input length
if (update_response_input_length) {
param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len;
}
if (tmp_len > max_len) {
max_len = tmp_len;
// selected_beam_index = j;
}
}
const int max_seq_len_b = min(param.max_time, max_len);
if (max_seq_len_b <= 0) {
continue;
}
#define GET_IX(time_ix, beam_ix) \
(param.batch_size * param.beam_width * (time_ix) + param.beam_width * batch + (beam_ix))
const int padding_offset_and_prompt_offset = max_input_length - input_len + prompt_len;
const int initial_tgt_ix = GET_IX(max_seq_len_b - 1 - padding_offset_and_prompt_offset, beam);
const int initial_parent_ix = GET_IX(max_seq_len_b - 1, beam);
param.beams[initial_tgt_ix] = __ldg(step_ids + initial_parent_ix);
int parent = parent_ids == nullptr ? 0 : __ldg(parent_ids + initial_parent_ix) % param.beam_width;
bool found_bad = false;
for (int level = max_seq_len_b - 2; level >= 0; --level) {
if (level < prompt_len || (level >= input_len && level < max_input_length)) {
continue;
}
int tgt_level = level >= max_input_length ? level - padding_offset_and_prompt_offset : level - prompt_len;
const int level_beam_ix = GET_IX(tgt_level, beam);
const int level_parent_ix = GET_IX(level, parent);
if (parent < 0 || parent > param.beam_width) {
// param.beams[level_beam_ix] = -1;
param.beams[level_beam_ix] = param.end_tokens[batch];
parent = -1;
found_bad = true;
}
else {
param.beams[level_beam_ix] = __ldg(step_ids + level_parent_ix);
parent = parent_ids == nullptr ? 0 : __ldg(parent_ids + level_parent_ix) % param.beam_width;
}
}
// set the padded part as end_token
// input_len
for (int index = max_len - padding_offset_and_prompt_offset;
index < param.max_time - param.max_prefix_soft_prompt_length;
++index) {
param.beams[GET_IX(index, beam)] = param.end_tokens[batch];
}
// Not necessary when using a BeamSearchDecoder, but necessary
// when a user feeds in possibly broken trajectory (i.e., non-eos
// entries in a beam following eos entries).
if (!found_bad) {
bool finished = false;
// skip the step 0 because it is often the start token
int start_step = max_input_length == 0 ? 1 : max_input_length;
for (int time = start_step; time < max_seq_len_b; ++time) {
const int level_beam_ix = GET_IX(time, beam);
if (finished) {
param.beams[level_beam_ix] = param.end_tokens[batch];
}
else if (param.beams[level_beam_ix] == param.end_tokens[batch]) {
finished = true;
}
}
}
#undef GET_IX
// transpose on output_ids
// remove p_prompt tuning virtual tokens (end tokens)
int actual_output_length = param.max_time - param.max_prefix_soft_prompt_length
- (param.max_input_length - param.max_input_without_prompt_length);
if (param.output_ids != nullptr) {
for (int j = 0; j < actual_output_length; j++) {
param.output_ids[i * actual_output_length + j] =
param.beams[j * param.batch_size * param.beam_width + i];
}
}
}
}
void invokeGatherTree(int* beams,
int* max_sequence_lengths,
const int max_time,
const int batch_size,
const int beam_width,
const int* step_ids,
const int* parent_ids,
const int* end_tokens,
cudaStream_t stream)
{
gatherTreeParam param;
param.beams = beams;
param.max_sequence_lengths = max_sequence_lengths;
param.max_time = max_time;
param.batch_size = batch_size;
param.beam_width = beam_width;
param.step_ids = step_ids;
param.parent_ids = parent_ids;
param.end_tokens = end_tokens;
param.max_input_length = 1;
param.prefix_soft_prompt_lengths = nullptr;
param.stream = stream;
invokeGatherTree(param);
}
void invokeGatherTree(int* beams,
int* max_sequence_lengths,
const int max_time,
const int batch_size,
const int beam_width,
const int* step_ids,
const int* parent_ids,
const int* end_tokens,
const int max_input_length,
cudaStream_t stream)
{
gatherTreeParam param;
param.beams = beams;
param.max_sequence_lengths = max_sequence_lengths;
param.max_time = max_time;
param.batch_size = batch_size;
param.beam_width = beam_width;
param.step_ids = step_ids;
param.parent_ids = parent_ids;
param.end_tokens = end_tokens;
param.max_input_length = max_input_length;
param.prefix_soft_prompt_lengths = nullptr;
param.stream = stream;
invokeGatherTree(param);
}
void invokeGatherTree(gatherTreeParam param)
{
int batchbeam = param.batch_size * param.beam_width;
dim3 grid(1), block(batchbeam);
// though the decoder does not support > 1024 for now
if (batchbeam > 1024) {
grid.x = ceil(param.batch_size * param.beam_width / 1024.);
block.x = 1024;
}
gatherTree<<<grid, block, 0, param.stream>>>(param);
}
__global__ void minusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num)
{
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < token_num; i += blockDim.x * gridDim.x) {
if (finished[i] == false) {
sequence_lengths[i] -= 1;
}
}
}
void invokeMinusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream)
{
dim3 block(min(256, token_num));
dim3 grid(ceil(token_num / 256.));
minusUnfinishedSeqlen<<<block, grid, 0, stream>>>(sequence_lengths, finished, token_num);
}
__global__ void plusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num)
{
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < token_num; i += blockDim.x * gridDim.x) {
if (finished[i] == false) {
sequence_lengths[i] += 1;
}
}
}
void invokePlusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream)
{
dim3 block(min(256, token_num));
dim3 grid(ceil(token_num / 256.));
plusUnfinishedSeqlen<<<block, grid, 0, stream>>>(sequence_lengths, finished, token_num);
}
// template void invokePaddingEmbeddingKernel(float* padded_embedding_kernel,
// const float* embedding_kernel,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// template void invokePaddingEmbeddingKernel(half* padded_embedding_kernel,
// const half* embedding_kernel,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// #ifdef ENABLE_BF16
// template void invokePaddingEmbeddingKernel(__nv_bfloat16* padded_embedding_kernel,
// const __nv_bfloat16* embedding_kernel,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// #endif
template<typename T>
__global__ void plusScalar(T* buf, const T val, const int size)
......@@ -695,112 +392,4 @@ void invokePlusScalar(T* buf, const T val, const int size, cudaStream_t stream)
template void invokePlusScalar(int* buf, const int val, const int size, cudaStream_t stream);
__global__ void finalize(int* output_ids,
int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
const int* topk_output_ids,
const int* topk_sequence_lengths,
const float* scores,
const float* topk_cum_log_probs,
const float* topk_log_probs,
const int* num_beams,
const int beam_width,
const int max_seq_len)
{
// output_ids: [bs, beam_width, max_seq_len]
// sequence_lengths: [bs, beam_width]
// cum_log_probs: [bs, beam_width]
// output_log_probs: [bs, beam_width, max_seq_len]
// topk_output_ids: [bs, 2 * beam_width, max_seq_len + 1]
// topk_sequence_lengths: [bs, 2 * beam_width]
// scores: [bs, 2 * beam_width]
// topk_cum_log_probs: [bs, 2 * beam_width]
// topk_log_probs: [bs, 2 * beam_width, max_seq_len + 1]
// num_beams: [bs]
// This kernel sorts the scores first, and then copies topk_output_ids
// into output_ids following the rank of the scores.
// Note that we remove the start_token (the id at first position) from topk_output_ids
extern __shared__ char array[];
int* rank = (int*)(array);
float* s_scores = (float*)(rank + beam_width);
if (threadIdx.x < num_beams[blockIdx.x]) {
s_scores[threadIdx.x] = scores[blockIdx.x * beam_width * 2 + threadIdx.x];
}
__syncthreads();
for (int i = 0; i < beam_width; i++) {
float score = threadIdx.x < num_beams[blockIdx.x] ? s_scores[threadIdx.x] : -FLT_MAX;
float max_score = blockReduceMax<float>(score);
if (threadIdx.x == 0) {
for (int j = 0; j < beam_width * 2; j++) {
if (s_scores[j] == max_score) {
rank[i] = j;
s_scores[j] = -FLT_MAX;
break;
}
}
}
__syncthreads();
}
if (threadIdx.x < beam_width) {
sequence_lengths[blockIdx.x * beam_width + threadIdx.x] =
topk_sequence_lengths[blockIdx.x * beam_width * 2 + rank[threadIdx.x]];
if (cum_log_probs != nullptr) {
cum_log_probs[blockIdx.x * beam_width + threadIdx.x] =
topk_cum_log_probs[blockIdx.x * beam_width * 2 + rank[threadIdx.x]];
}
}
for (int beam_idx = 0; beam_idx < beam_width; beam_idx++) {
// start from step 1 to skip the start token
for (int i = threadIdx.x; i < sequence_lengths[blockIdx.x * beam_width + beam_idx]; i += blockDim.x) {
output_ids[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] =
topk_output_ids[blockIdx.x * (beam_width * 2) * (max_seq_len + 1) + rank[beam_idx] * (max_seq_len + 1)
+ (i + 1)];
if (output_log_probs != nullptr) {
output_log_probs[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] =
topk_log_probs[blockIdx.x * (beam_width * 2) * (max_seq_len + 1)
+ rank[beam_idx] * (max_seq_len + 1) + (i + 1)];
}
}
}
}
void invokeFinalize(int* output_ids,
int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
const int* topk_output_ids,
const int* topk_sequence_lengths,
const float* scores,
const float* topk_cum_log_probs,
const float* topk_log_probs,
const int* num_beams,
const int beam_width,
const int max_seq_len,
const int batch_size,
cudaStream_t stream)
{
dim3 block(beam_width * 2);
block.x = (block.x + 31) / 32 * 32;
FT_CHECK(block.x < 1024);
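// Dynamic shared memory below holds rank[beam_width] (int) followed by
// s_scores[2 * beam_width] (float), matching the pointers set up at the top of the kernel.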
finalize<<<batch_size, block, beam_width * sizeof(int) + (beam_width * 2) * sizeof(float), stream>>>(
output_ids,
sequence_lengths,
cum_log_probs,
output_log_probs,
topk_output_ids,
topk_sequence_lengths,
scores,
topk_cum_log_probs,
topk_log_probs,
num_beams,
beam_width,
max_seq_len);
}
} // namespace turbomind
......@@ -22,17 +22,6 @@
namespace turbomind {
template<typename T>
void invokeDecodingInitialize(bool* finished,
int* sequence_length,
int* word_ids,
T* cum_log_probs,
const int* sentence_ids,
const int batch_size,
const int beam_width,
const int max_input_length,
cudaStream_t stream);
// get token from all_ids at step, then lookup from the embedding table
// by the token
template<typename T>
......@@ -99,72 +88,7 @@ void invokePaddingEmbeddingKernel(T* padded_embedding_kernel,
const int vocab_size_padded,
cudaStream_t stream);
void invokeGatherTree(int* beams,
int* max_sequence_lengths,
const int max_time,
const int batch_size,
const int beam_width,
const int* step_ids,
const int* parent_ids,
const int* end_tokens,
cudaStream_t stream);
void invokeGatherTree(int* beams,
int* max_sequence_lengths,
const int max_time,
const int batch_size,
const int beam_width,
const int* step_ids,
const int* parent_ids,
const int* end_tokens,
const int max_input_length,
cudaStream_t stream);
struct gatherTreeParam {
int* beams = nullptr;
int* max_sequence_lengths = nullptr;
int max_sequence_length_final_step = 0;
const int* input_lengths = nullptr;
// response input lengths (used to slice the ids during postprocessing)
int* response_input_lengths = nullptr;
int max_time = 0;
int batch_size = 0;
int beam_width = 0;
const int* step_ids = nullptr;
const int* parent_ids = nullptr;
const int* end_tokens = nullptr;
int max_input_length = 0;
const int* prefix_soft_prompt_lengths = nullptr;
// p_prompt_tuning prompt lengths, used to remove prompts during post-processing
const int* p_prompt_tuning_prompt_lengths = nullptr;
int max_input_without_prompt_length = 0;
// prefix soft prompt
int max_prefix_soft_prompt_length = 0;
int* output_ids = nullptr;
cudaStream_t stream;
};
void invokeGatherTree(gatherTreeParam param);
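// Minimal usage sketch (illustrative; buffer names are assumptions). The convenience
// overloads declared above fill the struct the same way internally:
//
//   gatherTreeParam param;
//   param.beams                = d_workspace_ids;    // [max_time, batch_size, beam_width]
//   param.max_sequence_lengths = d_sequence_lengths;
//   param.max_time             = max_input_length + max_output_length;
//   param.batch_size           = batch_size;
//   param.beam_width           = beam_width;
//   param.step_ids             = d_step_ids;
//   param.parent_ids           = d_parent_ids;
//   param.end_tokens           = d_end_ids;
//   param.max_input_length     = max_input_length;
//   param.output_ids           = d_output_ids;       // [batch_size, beam_width, output length]
//   param.stream               = stream;
//   invokeGatherTree(param);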
void invokeMinusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream);
void invokePlusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream);
template<typename T>
void invokePlusScalar(T* buf, const T val, const int size, cudaStream_t stream);
void invokeFinalize(int* output_ids,
int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
const int* topk_output_ids,
const int* topk_sequence_lengths,
const float* scores,
const float* topk_cum_log_probs,
const float* topk_log_probs,
const int* num_beams,
const int beam_width,
const int max_seq_len,
const int batch_size,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublas_v2.h"
#include "gen_relative_pos_bias.h"
#include "reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cstdio>
namespace turbomind {
/******************* invokeGenRelativePosBias ***********************/
// relative_position_bias_table is [(2*window_size-1)*(2*window_size-1), headNum]
// relative_position_bias is [head_num, window_size^2, window_size^2]
// grid(window_size*window_size, head_num)
// block(window_size*window_size)
template<typename T, typename Tindex>
__global__ void gen_relative_pos_bias(T* relative_position_bias,
const T* relative_position_bias_table,
const Tindex* relative_position_bias_index,
const int window_size,
const int head_num)
{
const int h_in_window = blockIdx.x / window_size;
const int w_in_window = blockIdx.x % window_size;
const int h_in_token = threadIdx.x / window_size;
const int w_in_token = threadIdx.x % window_size;
const int head_idx = blockIdx.y;
const int elements_per_window = window_size * window_size;
const size_t elements_per_window_2 = elements_per_window * elements_per_window;
const size_t output_idx = head_idx * elements_per_window_2 + blockIdx.x * elements_per_window + threadIdx.x;
if (output_idx < head_num * elements_per_window_2) {
const Tindex idx_in_table =
relative_position_bias_index[(h_in_window * window_size + w_in_window) * elements_per_window
+ h_in_token * window_size + w_in_token];
relative_position_bias[output_idx] = relative_position_bias_table[idx_in_table * head_num + head_idx];
}
}
template<typename T, typename Tindex>
void invokeGenRelativePosBias(T* relative_position_bias,
const T* relative_position_bias_table,
const Tindex* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream)
{
dim3 grid(window_size * window_size, head_num);
dim3 block(window_size * window_size);
if (block.x > 1024) {
printf("[ERROR][invokeGenRelativePosBias] window_size*window_size > 1024.\n");
exit(-1);
}
gen_relative_pos_bias<<<grid, block, 0, stream>>>(
relative_position_bias, relative_position_bias_table, relative_position_bias_index, window_size, head_num);
}
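// Example dimensions (illustrative): window_size = 7, head_num = 3
//   relative_position_bias_table : [(2*7-1)*(2*7-1), 3] = [169, 3]
//   relative_position_bias       : [3, 49, 49]
//   relative_position_bias_index : [49, 49] entries in [0, 168]
//   launch configuration         : grid(49, 3), block(49)   (window_size^2 must be <= 1024)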
/******************* invokeGenRelativePosBiasV2 ***********************/
template<typename T, typename Tindex>
void invokeGenRelativePosBiasV2(T* relative_position_bias,
const T* relative_coords_table,
const Tindex* relative_position_bias_index,
const T* cpb_mlp_weight1,
const T* cpb_mlp_bias1,
const T* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream)
{
dim3 grid(window_size * window_size, head_num);
dim3 block(window_size * window_size);
if (block.x > 1024) {
printf("[ERROR][invokeGenRelativePosBias] window_size*window_size > 1024.\n");
exit(-1);
}
T* relative_position_bias_table;
check_cuda_error(cudaMalloc(&relative_position_bias_table,
((2 * window_size - 1) * (2 * window_size - 1) * head_num) * sizeof(T)));
T* cpb_mlp_1;
check_cuda_error(
cudaMalloc(&cpb_mlp_1, ((2 * window_size - 1) * (2 * window_size - 1) * cpb_mlp_out_dim) * sizeof(T)));
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
int m = (2 * window_size - 1) * (2 * window_size - 1);
T alpha = (T)1.0f;
T beta = (T)0.0f;
cudaDataType_t type = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t compute_type = std::is_same<float, T>::value ? CUBLAS_COMPUTE_32F : CUBLAS_COMPUTE_16F;
#else
cudaDataType_t compute_type = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
#endif
cublasGemmAlgo_t algo = std::is_same<float, T>::value ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP;
check_cuda_error(cublasGemmEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
cpb_mlp_out_dim,
m,
cpb_mlp_in_dim,
&alpha,
cpb_mlp_weight1,
type,
cpb_mlp_in_dim,
relative_coords_table,
type,
cpb_mlp_in_dim,
&beta,
cpb_mlp_1,
type,
cpb_mlp_out_dim,
compute_type,
algo));
invokeGenericActivation<ReluActivation, T, T>(
cpb_mlp_1, cpb_mlp_bias1, nullptr, nullptr, nullptr, nullptr, m, cpb_mlp_out_dim, 0, nullptr, nullptr, stream);
check_cuda_error(cublasGemmEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
head_num,
m,
cpb_mlp_out_dim,
&alpha,
cpb_mlp_weight2,
type,
cpb_mlp_out_dim,
cpb_mlp_1,
type,
cpb_mlp_out_dim,
&beta,
relative_position_bias_table,
type,
head_num,
compute_type,
algo));
gen_relative_pos_bias<<<grid, block, 0, stream>>>(
relative_position_bias, relative_position_bias_table, relative_position_bias_index, window_size, head_num);
invokeSigmoid(
relative_position_bias, window_size * window_size * window_size * window_size * head_num, 16.0f, stream);
check_cuda_error(cudaFree(relative_position_bias_table));
check_cuda_error(cudaFree(cpb_mlp_1));
check_cuda_error(cublasDestroy(cublas_handle));
}
/******************* instantiation ***********************/
template void invokeGenRelativePosBias(float* relative_position_bias,
const float* relative_position_bias_table,
const int* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBias(half* relative_position_bias,
const half* relative_position_bias_table,
const int* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBias(float* relative_position_bias,
const float* relative_position_bias_table,
const int64_t* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBias(half* relative_position_bias,
const half* relative_position_bias_table,
const int64_t* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
__host__ __device__ uint32_t pow2_rounddown(uint32_t x)
{
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x >>= 1;
return x + 1;
}
template<typename T>
__global__ void generate_alibi_slopes(T* alibi_slopes, const size_t num_heads)
{
if (threadIdx.x < num_heads) {
// The largest power of 2 not exceeding num_heads, following HF's implementation.
int num_heads_pow2 = pow2_rounddown(num_heads);
// Loop over the attention heads.
for (int h = threadIdx.x; h < num_heads; h += blockDim.x) {
if (h < num_heads_pow2) {
alibi_slopes[h] = static_cast<T>(powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2) - 3.f)), h + 1));
}
else {
alibi_slopes[h] = static_cast<T>(
powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2 << 1) - 3.f)), (h - num_heads_pow2) * 2 + 1));
}
}
}
}
template<typename T>
void invokeBuildAlibiSlopes(T* alibi_slopes, const size_t num_heads, cudaStream_t stream)
{
// Generate the slopes of the attention linear biases (ALiBi).
//
// Paper: https://arxiv.org/abs/2108.12409
// HF's implementation
// https://github.com/huggingface/transformers/blob/56ef0ba44765162f830873c140bd40bdc975cc34/src/transformers/models/bloom/modeling_bloom.py#L86
// Author's implementation
// https://github.com/ofirpress/attention_with_linear_biases/blob/02aa87e7a29e9340efd28d6d169018eafb3aa57a/fairseq/models/transformer.py#L760
//
// alibi_slopes: [num_heads],
// strictly follows the HF implementation, which treats power-of-2 and non-power-of-2 head counts differently.
// what the paper generates differs from HF's when the number of heads is not a power of 2.
// num_heads: the number of attention heads.
// stream: a cuda stream.
dim3 block(min((int)num_heads, 512));
generate_alibi_slopes<<<1, block, 0, stream>>>(alibi_slopes, num_heads);
}
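// Worked example (values chosen for illustration). For num_heads = 12, num_heads_pow2 = 8:
//   heads 0..7  : slope = 0.5^(h+1)               -> 1/2, 1/4, ..., 1/256
//   heads 8..11 : slope = (1/sqrt(2))^(2*(h-8)+1) -> ~0.707, ~0.354, ~0.177, ~0.088
// When num_heads is already a power of 2 (e.g. 8), only the first branch is used.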
template void invokeBuildAlibiSlopes(float* alibi_slopes, const size_t num_heads, cudaStream_t stream);
template void invokeBuildAlibiSlopes(half* alibi_slopes, const size_t num_heads, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeBuildAlibiSlopes(__nv_bfloat16* alibi_slopes, const size_t num_heads, cudaStream_t stream);
#endif
template void invokeGenRelativePosBiasV2(float* relative_position_bias,
const float* relative_coords_table,
const int* relative_position_bias_index,
const float* cpb_mlp_weight1,
const float* cpb_mlp_bias1,
const float* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBiasV2(half* relative_position_bias,
const half* relative_coords_table,
const int* relative_position_bias_index,
const half* cpb_mlp_weight1,
const half* cpb_mlp_bias1,
const half* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBiasV2(float* relative_position_bias,
const float* relative_coords_table,
const int64_t* relative_position_bias_index,
const float* cpb_mlp_weight1,
const float* cpb_mlp_bias1,
const float* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBiasV2(half* relative_position_bias,
const half* relative_coords_table,
const int64_t* relative_position_bias_index,
const half* cpb_mlp_weight1,
const half* cpb_mlp_bias1,
const half* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <assert.h>
#include <cuda_runtime.h>
#include <stdint.h>
namespace turbomind {
enum class PositionEmbeddingType
{
relative,
absolute,
};
template<typename T, typename Tindex>
void invokeGenRelativePosBias(T* relative_position_bias,
const T* relative_position_bias_table,
const Tindex* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template<typename T>
void invokeBuildAlibiSlopes(T* linear_position_bias_slopes, const size_t head_num, cudaStream_t stream);
template<typename T, typename Tindex>
void invokeGenRelativePosBiasV2(T* relative_position_bias,
const T* relative_coords_table,
const Tindex* relative_position_bias_index,
const T* cpb_mlp_weight1,
const T* cpb_mlp_bias1,
const T* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
} // namespace turbomind
......@@ -182,29 +182,29 @@ void invokeLogProbFromLogits(float* cum_log_probs,
cum_log_probs, log_probs, input_lengths, max_input_length, batch_size, batch_first);
}
template void invokeLogProbFromLogits(float* cum_log_probs,
const float* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded,
void* workspace,
const size_t workspace_size,
cudaStream_t stream,
const bool batch_first);
template void invokeLogProbFromLogits(float* cum_log_probs,
const half* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded,
void* workspace,
const size_t workspace_size,
cudaStream_t stream,
const bool batch_first);
// template void invokeLogProbFromLogits(float* cum_log_probs,
// const float* logits,
// const int* input_ids,
// const int* input_lengths,
// const size_t max_input_length,
// const size_t batch_size,
// const size_t vocab_size,
// const size_t vocab_size_padded,
// void* workspace,
// const size_t workspace_size,
// cudaStream_t stream,
// const bool batch_first);
// template void invokeLogProbFromLogits(float* cum_log_probs,
// const half* logits,
// const int* input_ids,
// const int* input_lengths,
// const size_t max_input_length,
// const size_t batch_size,
// const size_t vocab_size,
// const size_t vocab_size_padded,
// void* workspace,
// const size_t workspace_size,
// cudaStream_t stream,
// const bool batch_first);
} // end of namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11050)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
#endif
#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
#define DO_SPLIT_SMALL_TOP_K_SOFTMAX
static const int SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE = 256;
#define TOPK_FP16_STORAGE 0
template<typename T>
__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
{
// score = log(prob) / (length)^length_penalty.
if (length_penalty == 0.0f || length == 1) {
return log_prob;
}
return log_prob / static_cast<T>(powf(length, length_penalty));
}
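// Worked example (values for illustration): log_prob = -6.0, length = 8
//   length_penalty = 0.0 -> -6.0 (unchanged)
//   length_penalty = 1.0 -> -6.0 / 8^1 = -0.75
//   length_penalty = 2.0 -> -6.0 / 8^2 = -0.09375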
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__
void batch_topK_kernel(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf)
{
int thread_id = threadIdx.x;
int block_id = blockIdx.x;
TopK<T, MAX_K> partial;
if (thread_id == 0) {
for (int i = 0; i < MAX_K; ++i) {
partial.p[i] = -1;
partial.u[i] = -FLT_MAX;
}
int index = block_id * MAX_K * MAX_K;
for (int i = 0; i < MAX_K * MAX_K; i++) {
partial.insert((T)topk_tmp_val_buf[index + i], topk_tmp_id_buf[index + i]);
}
index = block_id * MAX_K;
for (int i = 0; i < MAX_K; i++) {
id_buf[index + i] = partial.p[i];
}
}
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void batch_topK_kernel(const int* __restrict topk_tmp_id_buf,
const T* __restrict topk_tmp_val_buf,
int* __restrict id_buf,
T* __restrict val_buf)
{
int thread_id = threadIdx.x;
int block_id = blockIdx.x;
TopK<T, MAX_K> partial;
if (thread_id == 0) {
for (int i = 0; i < MAX_K; ++i) {
partial.p[i] = -1;
partial.u[i] = -FLT_MAX;
}
int index = block_id * MAX_K * MAX_K;
for (int i = 0; i < MAX_K * MAX_K; i++) {
partial.insert((T)topk_tmp_val_buf[index + i], topk_tmp_id_buf[index + i]);
}
index = block_id * MAX_K;
for (int i = 0; i < MAX_K; i++) {
id_buf[index + i] = partial.p[i];
val_buf[index + i] = partial.u[i];
}
}
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void batch_topk_kernel(const int* __restrict x,
const T* __restrict y,
int* __restrict z,
float* __restrict v,
float* output_log_probs,
const bool* finished,
const int* sequence_lengths,
BeamHypotheses beam_hyps,
const int V,
const int K,
const int vocab_size,
const float length_penalty,
const T diversity_rate)
{
int thread_id = threadIdx.x;
int vector_id = blockIdx.x;
// reposition x, y to data for the current vector
x += vector_id * V;
y += vector_id * V;
typedef cub::BlockReduce<TopK<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ int selected_beams;
__shared__ float old_cum_log_probs[MAX_K];
if (thread_id == 0) {
selected_beams = 0;
}
if (thread_id < K) {
old_cum_log_probs[thread_id] = v[vector_id * K + thread_id];
}
__syncthreads();
if (beam_hyps.num_beams != nullptr) {
const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + vector_id;
if (beam_hyps.num_beams[global_batch_idx] == 0 && thread_id == 0) {
beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
}
else if (beam_hyps.num_beams[global_batch_idx] == K) {
return;
}
}
TopK<T, MAX_K> partial;
for (int i = 0; i < MAX_K; ++i) {
partial.p[i] = -1;
partial.u[i] = -FLT_MAX;
}
for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
int i = elem_id % K;
T elem = length_penalty == 0.0f ? y[elem_id] :
apply_length_penalty(y[elem_id],
finished[vector_id] ? sequence_lengths[vector_id] :
sequence_lengths[vector_id] + 1,
length_penalty);
elem += diversity_rate * (T)i;
int elem_idx = elem_id; // x[elem_id];
partial.insert(elem, elem_idx);
}
TopK<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<T, MAX_K>);
if (thread_id == 0) {
z += vector_id * K;
v += vector_id * K;
for (int i = 0; i < MAX_K; ++i) {
if (beam_hyps.num_beams != nullptr && x[total.p[i]] % vocab_size == beam_hyps.end_ids[vector_id]) {
// if beam_token does not belong to the top num_beams tokens, it should not be added. See
// https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257
if (i >= K) {
// do nothing
}
else {
const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + vector_id;
const float normed_score = (float)total.u[i];
const int num_beam = beam_hyps.num_beams[global_batch_idx];
int beam_idx = num_beam;
// If there are already beam_width finished sentences, check whether the score of the selected
// candidate is higher than min_normed_score. If the current score is better, replace the worst one
// and update min_normed_score.
if (num_beam == K) {
if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) {
// end the tracing and exit this for loop
selected_beams = K;
break;
}
else {
// find the beam index whose score equals min_normed_score and erase it.
for (int j = 0; j < K; j++) {
if (beam_hyps.normed_scores[global_batch_idx * (K * 2) + j]
== beam_hyps.min_normed_scores[global_batch_idx]) {
beam_idx = j;
beam_hyps.num_beams[global_batch_idx]--;
beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
beam_hyps.normed_scores[global_batch_idx * (K * 2) + j] = normed_score;
for (int l = 0; l < K; l++) {
beam_hyps.min_normed_scores[global_batch_idx] =
min(beam_hyps.min_normed_scores[global_batch_idx],
beam_hyps.normed_scores[global_batch_idx * (K * 2) + l]);
}
break;
}
}
}
}
const int tgt_id_offset =
((vector_id + beam_hyps.ite * beam_hyps.local_batch_size) * (K * 2) + beam_idx)
* (beam_hyps.max_seq_len);
beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = beam_hyps.end_ids[vector_id];
if (beam_hyps.log_probs != nullptr) {
beam_hyps.log_probs[tgt_id_offset + beam_hyps.step] =
(float)y[total.p[i]] - old_cum_log_probs[(x[total.p[i]] / vocab_size) % K];
}
int prev_id = (x[total.p[i]] / vocab_size) % K;
for (int j = beam_hyps.step - 1; j >= 0; j--) {
const int src_idx = j * beam_hyps.batch_size * K
+ beam_hyps.ite * beam_hyps.local_batch_size * K + vector_id * K + prev_id;
beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx];
if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) {
beam_hyps.log_probs[tgt_id_offset + j] = beam_hyps.log_probs_src[src_idx];
}
prev_id = beam_hyps.parent_ids_src[src_idx];
}
const int tgt_beam_idx = global_batch_idx * (K * 2) + beam_idx;
beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = beam_hyps.step;
beam_hyps.normed_scores[tgt_beam_idx] = normed_score;
beam_hyps.min_normed_scores[global_batch_idx] =
min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]);
beam_hyps.num_beams[global_batch_idx]++;
beam_hyps.cum_log_probs[tgt_beam_idx] = (float)y[total.p[i]];
}
}
else if ((beam_hyps.num_beams != nullptr && i < 2 * K) || (beam_hyps.num_beams == nullptr && i < K)) {
z[selected_beams] = x[total.p[i]];
if (output_log_probs != nullptr) {
output_log_probs[vector_id * K + selected_beams] =
(float)y[total.p[i]] - old_cum_log_probs[(z[selected_beams] / vocab_size) % K];
}
v[selected_beams] = (float)y[total.p[i]];
selected_beams++;
}
__syncthreads();
if (selected_beams >= K) {
break;
}
}
}
if (threadIdx.x == 0 && beam_hyps.num_beams != nullptr) {
if (beam_hyps.num_beams[blockIdx.x] < K) {
beam_hyps.is_done[blockIdx.x] = false;
}
else if (beam_hyps.early_stopping) {
beam_hyps.is_done[blockIdx.x] = true;
}
}
}
struct __align__(8) MD
{
float m;
float d;
};
__device__ __forceinline__ MD reduce_md_op(MD a, MD b)
{
bool a_bigger = (a.m > b.m);
MD bigger_m = a_bigger ? a : b;
MD smaller_m = a_bigger ? b : a;
MD res;
res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.m - bigger_m.m);
res.m = bigger_m.m;
return res;
}
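// Worked example of the merge above (illustrative): combining the partial results
// (m = 2.0, d = 1.0) and (m = 0.0, d = 1.0) gives m = 2.0 and
// d = 1.0 + 1.0 * exp(0.0 - 2.0) ~= 1.135, i.e. exp(2 - 2) + exp(0 - 2),
// so d always equals the sum of exp(x_i - m) over the elements seen so far.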
template<typename T, int MAX_K>
struct TopKMD {
MD md;
TopK<T, MAX_K> topk;
};
template<typename T, int MAX_K>
__device__ __forceinline__ TopKMD<T, MAX_K> reduce_topk_md_op(const TopKMD<T, MAX_K>& a, const TopKMD<T, MAX_K>& b)
{
TopKMD<T, MAX_K> res;
res.md = reduce_md_op(a.md, b.md);
res.topk = reduce_topk_op(a.topk, b.topk);
return res;
}
template<typename T, int ITEMS_PER_THREAD, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_online_softmax_topk_kernel(const T* __restrict x,
const T* __restrict b,
const float* __restrict c,
const bool* __restrict finished,
int* __restrict z,
T* __restrict v,
int V,
int K,
const int* __restrict end_ids)
{
int thread_id = threadIdx.x;
int vector_id = blockIdx.x;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
// reposition x to data for the current vector
x += vector_id * V;
typedef cub::BlockReduce<TopKMD<float, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
TopKMD<float, MAX_K> partial;
bool finish = finished[vector_id];
for (int i = 0; i < MAX_K; ++i) {
partial.topk.p[i] = -1;
partial.topk.u[i] = -MAX_T_VAL;
}
partial.md.m = -MAX_T_VAL;
partial.md.d = 0.0F;
if (finish) {
for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
float elem = (elem_id == end_ids[vector_id / K]) ? MAX_T_VAL : -MAX_T_VAL;
MD new_elem{elem, 1.0F};
partial.md = reduce_md_op(partial.md, new_elem);
partial.topk.insert(elem, elem_id);
// if (elem_id > THREADBLOCK_SIZE * MAX_K && (elem_id == E)) break;
}
}
else {
for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
float elem = x[elem_id] + b[elem_id];
MD new_elem{elem, 1.0F};
partial.md = reduce_md_op(partial.md, new_elem);
partial.topk.insert(elem, elem_id);
}
}
TopKMD<float, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<float, MAX_K>);
if (thread_id == 0) {
z += vector_id * K;
v += vector_id * K;
c += vector_id;
// float d_total_inverse = __fdividef(1.0F, total.md.d);
float d_total_log = logf(total.md.d);
for (int i = 0; i < MAX_K; ++i) {
// float val = __expf(total.topk.u[i] - total.md.m) * d_total_inverse;
float val = total.topk.u[i] - total.md.m - d_total_log;
if (i < K) {
z[i] = total.topk.p[i] + vector_id * V; // faster transformer needs absolute id
v[i] = val + c[0];
}
}
}
}
template<typename T, int ITEMS_PER_THREAD, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE, 1) __global__
void beam_online_softmax_topk_stage1_kernel(const T* __restrict x,
const T* __restrict b,
const bool* __restrict finished,
float* __restrict t,
int V,
int K,
const int* __restrict end_ids)
{
int thread_id = threadIdx.x;
int vector_id = blockIdx.x; // batch beam index.
const int PACKED_TOP_KMD_SIZE = 2 * MAX_K + 2;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
// the vocabulary V of each beam is split into gridDim.y sections, one per block in the y dimension
const int v_local = (V + gridDim.y - 1) / gridDim.y;
const int section_start = v_local * blockIdx.y;
int section_end = section_start + v_local;
section_end = (section_end > V) ? V : section_end;
// reposition x to data for the current vector
x += vector_id * V;
#if TOPK_FP16_STORAGE == 1
typedef cub::BlockReduce<TopKMD<__half, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
#else
typedef cub::BlockReduce<TopKMD<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
#endif
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ float buf_s[PACKED_TOP_KMD_SIZE]; // save intermediate result
#if TOPK_FP16_STORAGE == 1
TopKMD<__half, MAX_K> partial;
#else
TopKMD<T, MAX_K> partial;
#endif
bool finish = finished[vector_id];
for (int i = 0; i < MAX_K; ++i) {
partial.topk.p[i] = -1;
partial.topk.u[i] = -MAX_T_VAL;
}
partial.md.m = -MAX_T_VAL;
partial.md.d = 0.0F;
if (finish) {
#pragma unroll 1
for (int elem_id = section_start + thread_id; elem_id < section_end; elem_id += THREADBLOCK_SIZE) {
float elem = (elem_id == end_ids[vector_id / K]) ? MAX_T_VAL : -MAX_T_VAL;
MD new_elem{elem, 1.0F};
partial.md = reduce_md_op(partial.md, new_elem);
partial.topk.insert(elem, elem_id);
}
}
else {
#pragma unroll 1
for (int elem_id = section_start + thread_id; elem_id < section_end; elem_id += THREADBLOCK_SIZE) {
T bias = b == nullptr ? (T)0.0f : b[elem_id]; // gpt-2 does not use bias
T elem = x[elem_id] + bias;
MD new_elem{elem, 1.0F};
partial.md = reduce_md_op(partial.md, new_elem);
partial.topk.insert(elem, elem_id);
}
}
#if TOPK_FP16_STORAGE == 1
TopKMD<__half, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<__half, MAX_K>);
#else
TopKMD<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<T, MAX_K>);
#endif
if (thread_id == 0) {
for (int i = 0; i < 2 * K; i++) {
reinterpret_cast<int*>(buf_s)[i] = total.topk.p[i] + vector_id * V; // faster transformer needs absolute id
buf_s[MAX_K + i] = total.topk.u[i];
}
buf_s[2 * MAX_K] = total.md.d;
buf_s[2 * MAX_K + 1] = total.md.m;
}
__syncthreads();
for (int elem_id = thread_id; elem_id < PACKED_TOP_KMD_SIZE; elem_id += THREADBLOCK_SIZE) {
t[blockIdx.x * PACKED_TOP_KMD_SIZE * gridDim.y + blockIdx.y * PACKED_TOP_KMD_SIZE + elem_id] = buf_s[elem_id];
}
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_online_softmax_topk_stage2_kernel(
const float* __restrict x, const float* __restrict c, int* __restrict z, T* __restrict v, int K, int parts_per_beam)
{
const int vector_id = blockIdx.x;
const int thread_id = threadIdx.x;
const int PACKED_TOP_KMD_SIZE = 2 * MAX_K + 2;
const bool IS_FP16 = std::is_same<T, half>::value;
const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;
extern __shared__ char buf_s_[]; // intermediate result
float* buf_s = reinterpret_cast<float*>(buf_s_);
//__shared__ float buf_s[PACKED_TOP_KMD_SIZE * THREADBLOCK_SIZE]; // intermediate result
typedef cub::BlockReduce<TopKMD<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
x += vector_id * PACKED_TOP_KMD_SIZE * parts_per_beam;
TopKMD<T, MAX_K> partial;
for (int i = 0; i < MAX_K; ++i) {
partial.topk.p[i] = -1;
partial.topk.u[i] = -MAX_T_VAL;
}
partial.md.m = -MAX_T_VAL;
partial.md.d = 0.0F;
// load and unpack into registers through smem
for (int idx = thread_id; idx < PACKED_TOP_KMD_SIZE * parts_per_beam; idx += THREADBLOCK_SIZE) {
buf_s[idx] = x[idx];
}
__syncthreads();
if (threadIdx.x < parts_per_beam) {
float* b_s = buf_s + thread_id * PACKED_TOP_KMD_SIZE;
for (int i = 0; i < 2 * K; i++) {
partial.topk.p[i] = reinterpret_cast<int*>(b_s)[i];
partial.topk.u[i] = b_s[MAX_K + i];
}
partial.md.d = b_s[2 * MAX_K];
partial.md.m = b_s[2 * MAX_K + 1];
}
__syncthreads();
TopKMD<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<T, MAX_K>);
if (thread_id == 0) {
z += vector_id * 2 * K;
v += vector_id * 2 * K;
c += vector_id;
float d_total_log = logf(total.md.d);
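// val = u - m - log(d) is the log-softmax of the candidate under the online-softmax
// statistics (running max m, denominator d); the beam's cumulative log-prob c[0]
// is added back when the output is written below.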
for (int i = 0; i < MAX_K; ++i) {
float val = (float)total.topk.u[i] - total.md.m - d_total_log;
if (i < 2 * K) {
z[i] = total.topk.p[i];
v[i] = (float)val + (float)c[0];
}
}
}
}
template<typename T, int MAX_K>
void beam_online_softmax_topk_stage2_kernelLauncher(const float* temp_storage,
const float* cum_log_probs,
int* ids,
T* vals,
int batch_size,
int beam_width,
int parts_per_beam,
cudaStream_t stream)
{
// might rewrite beam_online_softmax_topk_stage2_kernel not to depend on a constant block size
// in order to reduce compilation time
int smem_stage2_size = parts_per_beam * (2 * MAX_K + 2) * sizeof(float);
if (parts_per_beam <= 32) {
beam_online_softmax_topk_stage2_kernel<T, MAX_K, 32><<<batch_size * beam_width, 32, smem_stage2_size, stream>>>(
temp_storage, cum_log_probs, ids, vals, beam_width, parts_per_beam);
return;
}
if (parts_per_beam <= 64) {
beam_online_softmax_topk_stage2_kernel<T, MAX_K, 64><<<batch_size * beam_width, 64, smem_stage2_size, stream>>>(
temp_storage, cum_log_probs, ids, vals, beam_width, parts_per_beam);
return;
}
if (parts_per_beam <= 128) {
beam_online_softmax_topk_stage2_kernel<T, MAX_K, 128>
<<<batch_size * beam_width, 128, smem_stage2_size, stream>>>(
temp_storage, cum_log_probs, ids, vals, beam_width, parts_per_beam);
return;
}
assert(0);
}
template<typename T, int MAX_K>
void topK_softMax_kernelLauncher(const T* log_probs,
const T* bias,
const bool* finished,
const int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
int* ids,
void* temp_storage,
const int temp_storage_size,
BeamHypotheses* beam_hyps,
const int batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
T diversity_rate,
const float length_penalty,
cudaStream_t stream)
{
const int items_per_thread = 1;
const int block_sz = (MAX_K < 16) ? (MAX_K < 8) ? SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE : 128 : 64;
// const int block_sz = SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE;
assert(temp_storage_size % 2 == 0);
assert(temp_storage_size >= 2 * batch_size * beam_width * beam_width * 2);
// Beam search needs the sequence lengths of beams to apply length penalty.
assert(length_penalty == 0.0f || sequence_lengths != nullptr);
const int topk_buf_offset = ceil(batch_size * beam_width * beam_width * 2 / 4.) * 4;
int* topk_tmp_id_buf = reinterpret_cast<int*>(temp_storage);
T* topk_tmp_val_buf = reinterpret_cast<T*>(topk_tmp_id_buf + topk_buf_offset);
float* tmp_buffer = reinterpret_cast<float*>(topk_tmp_val_buf + topk_buf_offset);
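// temp_storage layout: [top-k ids | top-k values | packed stage-1 buffer], with the
// id/value regions padded to a multiple of 4 elements via topk_buf_offset.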
#ifdef DO_SPLIT_SMALL_TOP_K_SOFTMAX
int voc_parts = 4;
if (batch_size * beam_width < 256) {
// Volta has 80 SMs, so we aim for three waves
voc_parts = (240 + batch_size * beam_width - 1) / (batch_size * beam_width);
voc_parts = std::min(128, voc_parts); // the stage-2 launcher only supports up to 128 partitions
}
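// Stage 1 grid: one block per (beam, vocab partition); each block writes one packed
// partial top-k result of PACKED_TOP_KMD_SIZE floats into tmp_buffer.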
dim3 grid(batch_size * beam_width, voc_parts);
cudaFuncSetAttribute(beam_online_softmax_topk_stage1_kernel<T, items_per_thread, 2 * MAX_K, block_sz>,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxL1);
beam_online_softmax_topk_stage1_kernel<T, items_per_thread, 2 * MAX_K, block_sz>
<<<grid, block_sz, 0, stream>>>(log_probs, bias, finished, tmp_buffer, vocab_size, beam_width, end_ids);
sync_check_cuda_error();
#endif
if (beam_width > 1) {
#ifdef DO_SPLIT_SMALL_TOP_K_SOFTMAX
beam_online_softmax_topk_stage2_kernelLauncher<T, 2 * MAX_K>(
tmp_buffer, cum_log_probs, topk_tmp_id_buf, topk_tmp_val_buf, batch_size, beam_width, voc_parts, stream);
sync_check_cuda_error();
#else
beam_online_softmax_topk_kernel<T, items_per_thread, MAX_K, block_sz>
<<<batch_size * beam_width, block_sz, 0, stream>>>(log_probs,
bias,
cum_log_probs,
finished,
topk_tmp_id_buf,
topk_tmp_val_buf,
vocab_size,
beam_width,
end_ids);
#endif
#if 0
// produces wrong results when diversity_rate != 0.f
batch_topK_kernel<T, MAX_K, 32><<<batch_size, 32, 0, stream>>>
(topk_tmp_id_buf, topk_tmp_val_buf, ids, cum_log_probs);
#else
// We need 2*MAX_K candidates because at most K candidates may already be finished,
// and finished candidates are not carried into the next iteration.
batch_topk_kernel<T, MAX_K * 2, 32><<<batch_size, 32, 0, stream>>>(topk_tmp_id_buf,
topk_tmp_val_buf,
ids,
cum_log_probs,
output_log_probs,
finished,
sequence_lengths,
*beam_hyps,
beam_width * beam_width * 2,
beam_width,
vocab_size,
length_penalty,
diversity_rate);
sync_check_cuda_error();
#endif
}
else {
FT_CHECK(false);
#ifdef DO_SPLIT_SMALL_TOP_K_SOFTMAX
beam_online_softmax_topk_stage2_kernelLauncher<float, MAX_K>(
tmp_buffer, cum_log_probs, ids, cum_log_probs, batch_size, beam_width, voc_parts, stream);
#else
beam_online_softmax_topk_kernel<T, items_per_thread, MAX_K, block_sz>
<<<batch_size * beam_width, block_sz, 0, stream>>>(
log_probs, bias, cum_log_probs, finished, ids, cum_log_probs, vocab_size, beam_width, end_ids);
#endif
}
}
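// Dispatch helper: maps a runtime beam_width bucket [K, MAX_K] to a fixed MAX_K
// template instantiation (uses the GNU case-range extension `case K ... MAX_K:`).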
#define CASE_K(K, MAX_K) \
case K ... MAX_K: \
topK_softMax_kernelLauncher<T, MAX_K>(log_probs, \
bias, \
finished, \
sequence_lengths, \
cum_log_probs, \
output_log_probs, \
ids, \
temp_storage, \
temp_storage_size, \
beam_hyps, \
batch_size, \
beam_width, \
vocab_size, \
end_ids, \
diversity_rate, \
length_penalty, \
stream); \
break;
template<typename T>
void invokeTopkSoftMax(const T* log_probs,
const T* bias,
const bool* finished,
const int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
int* ids,
void* temp_storage,
const int temp_storage_size,
BeamHypotheses* beam_hyps,
const int batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
const float diversity_rate,
const float length_penalty,
cudaStream_t stream)
{
switch (beam_width) {
CASE_K(1, 4);
CASE_K(5, 8);
CASE_K(9, 16);
CASE_K(17, 32);
CASE_K(33, 64);
default:
throw std::runtime_error(fmtstr("Topk kernel of beam search does not support beam_width=%d", beam_width));
}
}
#undef CASE_K
template void invokeTopkSoftMax<float>(const float* log_probs,
const float* bias,
const bool* finished,
const int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
int* ids,
void* tmp_storage,
const int temp_storage_size,
BeamHypotheses* beam_hyps,
const int batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
const float diversity_rate,
const float length_penalty,
cudaStream_t stream);
template void invokeTopkSoftMax<half>(const half* log_probs,
const half* bias,
const bool* finished,
const int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
int* ids,
void* tmp_storage,
const int temp_storage_size,
BeamHypotheses* beam_hyps,
const int batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
const float diversity_rate,
const float length_penalty,
cudaStream_t stream);
} // end of namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
namespace turbomind {
template<typename T>
void invokeTopkSoftMax(const T* log_probs,
const T* bias,
const bool* finished,
const int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
int* ids,
void* tmp_storage,
const int temp_storage_size,
BeamHypotheses* beam_hyps,
const int batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
const float diversity_rate,
const float length_penalty,
cudaStream_t stream);
} // namespace turbomind
......@@ -14,13 +14,10 @@
cmake_minimum_required(VERSION 3.8)
add_subdirectory(beam_search_layers)
add_subdirectory(sampling_layers)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart
TopKSamplingLayer TopPSamplingLayer
OnlineBeamSearchLayer BeamSearchLayer ban_bad_words stop_criteria
gpt_kernels tensor nvtx_utils)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart TopKSamplingLayer
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
......@@ -17,11 +17,9 @@
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/kernels/ban_bad_words.h"
#include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
......@@ -45,37 +43,6 @@ template<typename T>
void DynamicDecodeLayer<T>::initialize()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
beamsearch_decode_ = new BeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
topk_decode_ = new TopKSamplingLayer<T>(0,
vocab_size_,
......@@ -131,8 +98,6 @@ template<typename T>
DynamicDecodeLayer<T>::~DynamicDecodeLayer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete online_beamsearch_decode_;
delete beamsearch_decode_;
delete topk_decode_;
delete topp_decode_;
freeBuffer();
......@@ -284,105 +249,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
// dynamic decode GPT
if (beam_width > 1) {
// Because batched beam search is not supported yet, we have to process requests one by one
// when they have different runtime arguments.
const size_t dynamic_decode_batch_size = has_diff_runtime_args_ ? 1 : local_batch_size;
const int dynamic_decode_total_iteration = local_batch_size / dynamic_decode_batch_size;
for (uint dynamic_ite = ite * dynamic_decode_total_iteration;
dynamic_ite < (ite + 1) * dynamic_decode_total_iteration;
++dynamic_ite) {
const int dynamic_id_offset = dynamic_ite * dynamic_decode_batch_size * beam_width;
const int dynamic_decode_vocab_size_units_offset = dynamic_id_offset * vocab_size_padded_;
// common inputs
Tensor logits = input_tensors->at("logits");
Tensor end_id = input_tensors->at("end_id");
TensorMap dynamic_decode_input_tensors(
{{"logits",
Tensor{logits.where,
logits.type,
{dynamic_decode_batch_size, logits.shape[1], logits.shape[2]},
logits.getPtrWithOffset(dynamic_decode_vocab_size_units_offset)}},
{"step", input_tensors->at("step")},
{"max_input_length", input_tensors->at("max_input_length")},
{"end_id",
Tensor{end_id.where,
end_id.type,
{dynamic_decode_batch_size},
end_id.getPtrWithOffset(dynamic_ite * dynamic_decode_batch_size)}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &dynamic_ite}}});
if (input_tensors->isExist("embedding_bias")) {
dynamic_decode_input_tensors.insert({"embedding_bias", input_tensors->at("embedding_bias")});
}
if (input_tensors->isExist("input_lengths")) {
Tensor input_lengths = input_tensors->at("input_lengths");
dynamic_decode_input_tensors.insert(
{"input_lengths",
input_lengths.slice({dynamic_decode_batch_size, input_lengths.shape[1]}, dynamic_id_offset)});
}
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
if (t->first.find("random_seed") == std::string::npos) {
dynamic_decode_input_tensors.insert(*t);
}
}
// common outputs
TensorMap dynamic_decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
dynamic_decode_output_tensors.insert({"sequence_length",
Tensor{sequence_length.where,
sequence_length.type,
{dynamic_decode_batch_size * beam_width},
sequence_length.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("finished")) {
Tensor finished = output_tensors->at("finished");
dynamic_decode_output_tensors.insert({"finished",
Tensor{finished.where,
finished.type,
{dynamic_decode_batch_size * beam_width},
finished.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("cum_log_probs")) {
Tensor cum_log_probs = output_tensors->at("cum_log_probs");
dynamic_decode_output_tensors.insert({"cum_log_probs",
Tensor{cum_log_probs.where,
cum_log_probs.type,
{dynamic_decode_batch_size * beam_width},
cum_log_probs.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("beam_hyps")) {
dynamic_decode_output_tensors.insert("beam_hyps", output_tensors->at("beam_hyps"));
}
if (output_tensors->isExist("output_log_probs")) {
dynamic_decode_output_tensors.insert({"output_log_probs", output_tensors->at("output_log_probs")});
}
dynamic_decode_input_tensors.insert({"src_cache_indirection", input_tensors->at("src_cache_indirection")});
dynamic_decode_output_tensors.insert({"parent_ids", output_tensors->at("parent_ids")});
dynamic_decode_output_tensors.insert(
{"tgt_cache_indirection", output_tensors->at("tgt_cache_indirection")});
FT_CHECK_WITH_INFO(dynamic_decode_output_tensors.isExist("cum_log_probs"),
"cum_log_probs should be provided in beam search.");
if (true || beam_width < 16
|| (output_tensors->isExist("beam_hyps")
&& input_tensors->getVal<float>("beam_search_diversity_rate", 0.0f) != 0.0f)) {
// only online_beamsearch_decode_ supports beam_search_diversity_rate when beam_hyps is used
online_beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
else {
FT_CHECK(false); // deprecate this module
beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
} // end of dynamic_ite
FT_CHECK_WITH_INFO(0, "Beam-search is not supported.");
}
else { // beam_width=1
// In sampling, batched sampling is supported, so we always compute all sentences at once.
......
......@@ -19,7 +19,6 @@
#include <string>
#include <unordered_map>
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
......@@ -34,8 +33,6 @@ protected:
void initialize();
bool hasDiffRuntimeArgs(TensorMap* input_tensors);
DynamicDecodeBaseLayer* online_beamsearch_decode_;
DynamicDecodeBaseLayer* beamsearch_decode_;
DynamicDecodeBaseLayer* topk_decode_;
DynamicDecodeBaseLayer* topp_decode_;
......