Unverified Commit 9efcac38 authored by Li Zhang, committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdlib.h>
namespace fastertransformer {
// clang-format off
template<typename T> struct GeluActivation;
template<typename T> struct ReluActivation;
template<typename T> struct SiluActivation;
template<typename T> struct IdentityActivation;
// clang-format on
template<template<typename T> class Activation, typename T, typename BT>
void invokeGenericActivation(T* out,
const BT* bias,
const T* gated_weights,
const BT* gated_bias,
const int* ia3_tasks,
const T* ia3_weights,
const int m,
const int n,
const int int8_mode,
const float* activation_in,
const float* activation_out,
const int* padding_offset,
const int seq_len,
cudaStream_t stream);
template<template<typename T> class Activation, typename T, typename BT>
void invokeGenericActivation(T* out,
const BT* bias,
const T* gated_weights,
const BT* gated_bias,
const int* ia3_tasks,
const T* ia3_weights,
const int m,
const int n,
const int int8_mode,
const float* activation_in,
const float* activation_out,
cudaStream_t stream)
{
invokeGenericActivation<Activation, T, BT>(out,
bias,
gated_weights,
gated_bias,
ia3_tasks,
ia3_weights,
m,
n,
int8_mode,
activation_in,
activation_out,
(const int*)nullptr,
0,
stream);
}
template<typename T>
void invokeAddBiasGeluV2(T* out,
const T* bias,
const int* ia3_tasks,
const T* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
template<typename T>
void invokeAddBias(T* out, T const* bias, const int m, const int n, cudaStream_t stream)
{
invokeGenericActivation<IdentityActivation, T, T>(
out, bias, nullptr, nullptr, nullptr, nullptr, m, n, 0, nullptr, nullptr, stream);
}
template<typename T>
void invokeAddBiasGeluV2(
T* out, const T* bias, const int* ia3_tasks, const T* ia3_weights, const int m, const int n, cudaStream_t stream)
{
invokeAddBiasGeluV2(out, bias, ia3_tasks, ia3_weights, nullptr, 0, m, n, stream);
}
template<typename T>
void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream);
template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream);
} // namespace fastertransformer
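A minimal call-site sketch for the declarations above, assuming this header is included (buffer names are placeholders, not part of the API); the gated, IA3, and int8 arguments are all optional and can be passed as nullptr/0:
// Sketch only: apply SiLU + bias to an [m, n] fp16 intermediate buffer.
void apply_silu_bias(half* inter_buf, const half* bias, int m, int n, cudaStream_t stream)
{
    using namespace fastertransformer;
    invokeGenericActivation<SiluActivation, half, half>(inter_buf,
                                                        bias,
                                                        nullptr,   // gated_weights
                                                        nullptr,   // gated_bias
                                                        nullptr,   // ia3_tasks
                                                        nullptr,   // ia3_weights
                                                        m,
                                                        n,
                                                        0,         // int8_mode
                                                        nullptr,   // activation_in scale
                                                        nullptr,   // activation_out scale
                                                        stream);
}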
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
__global__ void ban_bad_words(T* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int beam_width,
const int* bad_words,
size_t bad_words_len,
bool share_words,
int id_offset,
int vocab_size_padded,
size_t step)
{
const int id = blockIdx.x * blockDim.x + threadIdx.x;
const int batch_idx = blockIdx.y / beam_width;
const int beam_idx = blockIdx.y % beam_width;
const int* base_bad_words = share_words ? bad_words : bad_words + batch_idx * 2 * bad_words_len;
const int* base_bad_words_offsets = base_bad_words + bad_words_len;
if (id >= bad_words_len || base_bad_words_offsets[id] < 0) {
return;
}
const int item_end = base_bad_words_offsets[id];
const int item_start = (id > 0) ? base_bad_words_offsets[id - 1] : 0;
const int item_size = item_end - item_start;
/* The single-token case unconditionally bans the token */
bool should_ban = item_size == 1;
/* Multi-token case and enough previously generated tokens to look for a match */
if (item_size > 1 && step >= item_size - 1) {
should_ban = true;
int parent_id = beam_idx;
const bool gather_beam = beam_width > 1;
for (int token_idx = item_size - 2; token_idx >= 0; token_idx--) {
const int previous_token = output_ids_buf[(step - (item_size - 1) + token_idx) * batch_size * beam_width
+ id_offset + batch_idx * beam_width + parent_id];
if (previous_token != base_bad_words[item_start + token_idx]) {
should_ban = false;
break;
}
if (gather_beam) {
parent_id = parent_ids_buf[(step - (item_size - 1) + token_idx) * beam_width * batch_size + id_offset
+ batch_idx * beam_width + parent_id];
if (parent_id < 0 || parent_id >= beam_width) {
should_ban = false;
break;
}
}
}
}
if (should_ban) {
int banned_token = base_bad_words[item_end - 1];
if (0 < banned_token && banned_token < vocab_size_padded) {
logits[batch_idx * beam_width * vocab_size_padded + beam_idx * vocab_size_padded + banned_token] =
static_cast<T>(-INFINITY);
}
}
}
template<typename T>
void invokeBanBadWords(T* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream)
{
dim3 block, grid;
block.x = min(((bad_words_len + 32 - 1) / 32) * 32, 256UL);
grid.x = (bad_words_len + block.x - 1) / block.x;
grid.y = local_batch_size * beam_width;
ban_bad_words<<<grid, block, 0, stream>>>(logits,
output_ids_buf,
parent_ids_buf,
batch_size,
beam_width,
bad_words,
bad_words_len,
share_words,
id_offset,
vocab_size_padded,
step);
sync_check_cuda_error();
}
template void invokeBanBadWords(half* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeBanBadWords(__nv_bfloat16* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
#endif
template void invokeBanBadWords(float* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace fastertransformer {
template<typename T>
void invokeBanBadWords(T* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
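From the kernel in ban_bad_words.cu above, each bad-words entry consists of two int rows of length bad_words_len: row 0 holds the flattened token ids of all banned sequences, row 1 holds their cumulative end offsets (negative for unused slots). A host-side sketch of building that layout for the banned sequences {5} and {7, 8, 9} (illustrative only; the layout is inferred from the kernel, not documented here):
#include <vector>

std::vector<int> make_bad_words_buffer()
{
    const int bad_words_len = 4;                 // total flattened tokens across both sequences
    std::vector<int> buf(2 * bad_words_len, -1);
    // Row 0: flattened token ids of {5} and {7, 8, 9}.
    buf[0] = 5;  buf[1] = 7;  buf[2] = 8;  buf[3] = 9;
    // Row 1: cumulative end offsets into row 0 (-1 marks unused slots).
    buf[bad_words_len + 0] = 1;                  // {5} ends at offset 1
    buf[bad_words_len + 1] = 4;                  // {7, 8, 9} ends at offset 4
    return buf;                                  // copy to device and pass as `bad_words`, with share_words = true
}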
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
namespace fastertransformer {
template<typename T>
__global__ void add_bias_temperature(T* logits,
const T* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
int tid = threadIdx.x;
int bid = blockIdx.x;
int bbid = blockIdx.y;
logits += bbid * vocab_size_padded;
const T MASK_VAL = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
const T inv_temp = static_cast<T>(1.0f / (temperature + 1e-6f));
for (int i = tid + bid * blockDim.x; i < vocab_size_padded; i += blockDim.x * gridDim.x) {
if (i < vocab_size) {
T bias_val = bias == nullptr ? (T)(0.0f) : bias[i];
logits[i] = (logits[i] + bias_val) * inv_temp;
}
else {
logits[i] = MASK_VAL;
}
}
}
template<>
__global__ void add_bias_temperature(half2* logits,
const half2* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
assert(vocab_size % 2 == 0);
assert(vocab_size_padded % 2 == 0);
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int bbid = blockIdx.y;
const half2 mask_val = __float2half2_rn(-HALF_FLT_MAX);
const half2 inv_temp = __float2half2_rn(1.0f / (temperature + 1e-6f));
const int half_vocab_size = vocab_size / 2;
const int half_vocab_size_padded = vocab_size_padded / 2;
logits += bbid * half_vocab_size_padded;
for (int index = tid + bid * blockDim.x; index < half_vocab_size_padded; index += blockDim.x * gridDim.x) {
int vocab_idx = index % half_vocab_size_padded;
half2 logit = vocab_idx < half_vocab_size ? __ldg(&logits[index]) : mask_val;
if (vocab_idx < half_vocab_size) {
if (bias != nullptr) {
logit = __hadd2(logit, bias[vocab_idx]);
}
logit = __hmul2(logit, inv_temp);
}
logits[index] = logit;
}
}
template<typename T, bool IS_ADDITIVE>
__global__ void apply_repetition_penalty(T* logits,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int step,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int max_input_length,
const float repetition_penalty)
{
assert(step > 0);
const int tid = threadIdx.x;
const int bbid = blockIdx.x;
const int batch_id = bbid / beam_width;
const int bbsize = batch_size * beam_width;
logits += bbid * vocab_size_padded;
extern __shared__ char sbuf[];
T* penalty_logits = reinterpret_cast<T*>(sbuf);
// prevent misalignment when sizeof(T) == 2
int* penalty_indices = reinterpret_cast<int*>(sbuf + (sizeof(T) * step + 31) / 32 * 32);
const int input_length = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length;
if (tid == 0) {
T repet_penalty = static_cast<T>(repetition_penalty);
int prev_id = current_ids[bbid];
T prev_logit = logits[prev_id];
penalty_indices[step - 1] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[step - 1] = prev_logit - repet_penalty;
}
else {
penalty_logits[step - 1] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
if (step > 1) {
int parent_beam = bbid % beam_width;
for (int i = step - 2; i >= 0; --i) {
// Skip the padded tokens.
if (i >= input_length && i < max_input_length) {
continue;
}
parent_beam = parent_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_id = previous_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_logit = logits[prev_id];
penalty_indices[i] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[i] = prev_logit - repet_penalty;
}
else {
penalty_logits[i] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
}
}
}
__syncthreads();
for (int i = tid; i < step; i += blockDim.x) {
if (i >= input_length && i < max_input_length) {
continue;
}
logits[penalty_indices[i]] = penalty_logits[i];
}
}
template<typename T>
__global__ void apply_min_length_penalty(T* logits,
const int min_length,
const int* end_ids,
const int* sequence_lengths,
const int max_input_length,
const int beam_width,
const int vocab_size_padded)
{
int bbid = threadIdx.x + blockIdx.x * blockDim.x; // batch-beam index
int bid = bbid / beam_width; // batch index
// We need +1 because sequence_lengths = max_input_length + num_gen_tokens - 1,
// which is equal to the length of k/v caches.
if (sequence_lengths[bbid] + 1 - max_input_length < min_length) {
T mask_val = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val;
}
}
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream)
{
if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) {
dim3 block(512);
if (std::is_same<T, half>::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) {
dim3 grid((vocab_size_padded / 2 + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(reinterpret_cast<half2*>(logits),
reinterpret_cast<const half2*>(bias),
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
temperature);
}
else {
dim3 grid((vocab_size_padded + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(
logits, bias, batch_size, beam_width, vocab_size, vocab_size_padded, temperature);
}
}
if (repetition_penalty_type != RepetitionPenaltyType::None && step > 0) {
if (repetition_penalty != getDefaultPenaltyValue(repetition_penalty_type)) {
size_t smem_size = (sizeof(T) * step + 31) / 32 * 32 + sizeof(int) * step;
dim3 block(256);
dim3 grid(beam_width * local_batch_size);
if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) {
apply_repetition_penalty<T, false>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
// TODO(jaedeokk):
// Remove (+ite ...) by getting parent_ids with offset
// and then remove 'ite' argument from the function.
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
else if (repetition_penalty_type == RepetitionPenaltyType::Additive) {
apply_repetition_penalty<T, true>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
}
}
if (step - max_input_length < min_length) {
FT_CHECK_WITH_INFO(sequence_lengths != nullptr, "Need sequence_lengths to apply min length penalty");
FT_CHECK_WITH_INFO(end_ids != nullptr, "Need end_id to apply min length penalty");
const int block_size = min(local_batch_size * beam_width, 1024);
const int grid_size = (local_batch_size * beam_width + block_size - 1) / block_size;
apply_min_length_penalty<<<grid_size, block_size, 0, stream>>>(
logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded);
}
}
template void invokeAddBiasApplyPenalties(int step,
float* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const float* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
template void invokeAddBiasApplyPenalties(int step,
half* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const half* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_fp16.h>
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
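The two repetition-penalty variants handled above reduce to a simple per-token rule; a scalar sketch of the update applied to each previously generated token's logit x with penalty p (mirrors apply_repetition_penalty in the .cu above, not part of this header's API):
// Sketch of the per-token penalty rule.
float penalized_logit(float x, float p, bool additive)
{
    if (additive) {
        return x - p;                   // RepetitionPenaltyType::Additive
    }
    return x > 0.f ? x / p : x * p;     // RepetitionPenaltyType::Multiplicative
}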
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#pragma once
namespace fastertransformer {
// In the original beam search implementation, if a beam is finished, we mark it as finished
// and only continue beam search on the remaining beams (namely, beam_width - 1 beams in the next step).
//
// In this implementation, when a beam is finished, we trace its path and record it in output_ids_tgt,
// and we also record the normalized scores. The beam search then continues to use `beam_width` beams in
// the next step.
//
// After we have collected `beam_width` beams, we sort them by their norm_scores.
struct BeamHypotheses {
int* output_ids_tgt = nullptr;
int* sequence_lengths_tgt = nullptr;
float* cum_log_probs = nullptr; // cumulative log probabilities
float* normed_scores = nullptr; // cum_log_probs / (length ** length_penalty)
float* log_probs = nullptr; // log probs of each generated token
float* min_normed_scores = nullptr; // record the min normed scores for each batch
int* num_beams = nullptr; // the number of finished beams we collect
bool* is_done = nullptr;
// Used to set inputs
const int* output_ids_src;
const int* parent_ids_src;
const int* sequence_lengths_src;
const int* end_ids;
const float* log_probs_src;
// some variables for kernels
int step;
int ite;
int batch_size;
int local_batch_size;
int max_seq_len;
float length_penalty;
bool early_stopping = true;
bool is_return_normed_score = true; // return normed_cum_log_probs or cum_log_probs
};
template<typename T>
void invokeTopkBeamSearch(void* workspace,
size_t& workspace_size,
T* log_probs,
int* ids,
BeamHypotheses* beam_hyps,
const bool* finished,
const int* sequence_lengths,
const int batch_size,
const int beam_width,
const int vocab_size_padded_,
const T diversity_rate,
const float length_penalty,
const int* end_ids,
cudaStream_t stream);
template<typename T>
void invokeTileEncoderResults(T* tiled_encoder_output,
int* tiled_encoder_sequence_length,
const T* encoder_output,
const int* encoder_sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const bool* finished,
const float* cum_log_probs,
const int batch_size,
const int beam_width,
cudaStream_t stream);
} // namespace fastertransformer
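Per the comments on BeamHypotheses, a finished hypothesis is ranked by its length-normalized score; a host-side sketch of that formula (illustrative only):
#include <cmath>

// normed_score = cum_log_probs / (length ** length_penalty), as described above.
float normed_score(float cum_log_probs, int length, float length_penalty)
{
    return cum_log_probs / std::pow(static_cast<float>(length), length_penalty);
}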
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifdef ENABLE_FP8
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#endif // ENABLE_FP8
namespace fastertransformer {
void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
size_t* h_token_num,
int* tmp_mask_offset,
int* cu_seqlens,
const int* sequence_length,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
inline void invokeGetPaddingOffset(size_t* h_pinned_token_num,
size_t* h_token_num,
int* tmp_mask_offset,
const int* sequence_length,
const int batch_size,
const int max_seq_len,
cudaStream_t stream)
{
invokeGetPaddingOffsetAndCuSeqLens(
h_pinned_token_num, h_token_num, tmp_mask_offset, nullptr, sequence_length, batch_size, max_seq_len, stream);
}
template<typename T>
void invokeBuildEncoderAttentionMask(
T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len,
cudaStream_t stream);
template<typename T>
void invokeRebuildPadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
template<typename T>
void invokeRemovePadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
template<typename T>
void invokeBuildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
template<typename T_OUT, typename T_IN>
struct getLastTokenDequantizeParam {
T_OUT* const output;
T_IN const* const input;
float const* const input_scale;
const int batch_size;
const int max_seq_len;
const int d_model;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param);
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
struct QuantizeMatrixRebuildPaddingParam {
T_OUT* dst;
const T_IN* src;
const int* padding_offset;
const int token_num;
const int d_model;
const float* scale;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif // ENABLE_FP8
} // namespace fastertransformer
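A hedged sketch of the usual pad/unpad round trip built from these declarations, assuming this header is included (buffer names are placeholders; h_pinned_token_num must point to pinned host memory):
// Sketch only: pack a padded [batch, max_seq_len, hidden] activation into [token_num, hidden] and back.
template<typename T>
void pad_unpad_round_trip(T* padded_in, T* packed, T* padded_out,
                          int* padding_offset, size_t* h_pinned_token_num,
                          const int* d_sequence_lengths,
                          int batch_size, int max_seq_len, int hidden_dim,
                          cudaStream_t stream)
{
    using namespace fastertransformer;
    size_t h_token_num = 0;
    // 1. Count the real (non-padded) tokens and fill padding_offset.
    invokeGetPaddingOffset(h_pinned_token_num, &h_token_num, padding_offset,
                           d_sequence_lengths, batch_size, max_seq_len, stream);
    // 2. Drop padded positions before the heavy GEMMs.
    invokeRemovePadding(packed, padded_in, padding_offset, static_cast<int>(h_token_num), hidden_dim, stream);
    //    ... run the transformer layers on `packed` here ...
    // 3. Scatter the packed rows back to their padded positions.
    invokeRebuildPadding(padded_out, packed, padding_offset, static_cast<int>(h_token_num), hidden_dim, stream);
}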
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <cuda_fp16.h>
#include <iostream>
#include "src/fastertransformer/utils/cuda_utils.h"
#define CUSTOM_AR_SIZE_THRESHOLD 50331648
#define MAX_ALL_REDUCE_BLOCKS 24
#define FLAG(a) ((uint32_t)((a) % 0x146))
#define RANKS_PER_NODE 8
#define WARP_SIZE 32
#define DEFAULT_BLOCK_SIZE 1024
#define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216
namespace fastertransformer {
#ifdef ENABLE_BF16
typedef struct bf168 {
__nv_bfloat162 x;
__nv_bfloat162 y;
__nv_bfloat162 z;
__nv_bfloat162 w;
} bf168;
#endif
template<typename T>
struct AllReduceParams {
size_t elts_total;
size_t elts_per_rank;
size_t elts_per_block;
size_t rank_offset;
size_t rank, local_rank, node_id;
uint32_t barrier_flag;
uint32_t* peer_barrier_ptrs[RANKS_PER_NODE];
T* peer_comm_buffer_ptrs[RANKS_PER_NODE];
T* local_output_buffer_ptr;
};
template<typename T>
void invokeOneOrTwoShotAllReduceKernel(AllReduceParams<T>& param, cudaStream_t stream);
void kernelLaunchConfig(int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo);
} // namespace fastertransformer
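A small sketch of querying a launch configuration with kernelLaunchConfig (the mapping of kernel_algo values to the one-shot/two-shot variants and the byte-size threshold rule are assumptions for illustration, not documented by this header):
#include <cuda_fp16.h>

// Sketch only: choose grid/block sizes for a custom all-reduce of `elts` fp16 elements.
void pick_ar_launch_config(size_t elts, int& blocks_per_grid, int& threads_per_block)
{
    const size_t msg_bytes   = elts * sizeof(half);
    const int    kernel_algo = msg_bytes <= DEFALUT_ALGO_AR_SIZE_THRESHOLD ? 0 : 1;  // assumed mapping
    fastertransformer::kernelLaunchConfig(blocks_per_grid, threads_per_block, elts, kernel_algo);
}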
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include <assert.h>
#include <float.h>
#include <type_traits>
template<typename T, typename KERNEL_PARAMS_TYPE>
void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
{
switch (params.hidden_size_per_head) {
case 128:
mmha_launch_kernel<T, 128, 128, KERNEL_PARAMS_TYPE>(params, stream);
break;
default:
assert(false);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream)
{
multihead_attention_<float, Masked_multihead_attention_params<float>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream)
{
multihead_attention_<uint16_t, Masked_multihead_attention_params<uint16_t>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_BF16
void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream)
{
multihead_attention_<__nv_bfloat16, Masked_multihead_attention_params<__nv_bfloat16>>(params, stream);
}
#endif
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
////////////////////////////////////////////////////////////////////////////////////////////////////
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
exit(1); \
} \
} while (0)
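// A hedged usage sketch of the macro (illustrative; the helper below is not part of this header):
// static void clear_buffer(void* dst, size_t bytes, cudaStream_t stream)
// {
//     // On failure, prints file/line plus the CUDA error string and exits.
//     CHECK_CUDA(cudaMemsetAsync(dst, 0, bytes, stream));
// }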
////////////////////////////////////////////////////////////////////////////////////////////////////
// The structure of parameters for the masked multihead attention kernel.
//
// We use the following terminology to describe the different dimensions.
//
// B: Batch size (number of sequences),
// L: Sequence length,
// D: Hidden dimension,
// H: Number of heads,
// Dh: Hidden dimension per head - Dh = D / H.
template<typename T>
struct Multihead_attention_params_base {
// The output buffer. Dimensions B x D.
T* out = nullptr;
// The input Qs and the associated bias. Dimensions B x D and D, resp.
const T *q = nullptr, *q_bias = nullptr;
// The input Ks and the associated bias. Dimensions B x D and D, resp.
const T *k = nullptr, *k_bias = nullptr;
// The input Vs and the associated bias. Dimensions B x D and D, resp.
const T *v = nullptr, *v_bias = nullptr;
// The cache for the Ks. The size must be at least B x L x D.
T* k_cache = nullptr;
// The cache for the Vs. The size must be at least B x L x D.
T* v_cache = nullptr;
// The indirections to use for cache when beam sampling.
const int* cache_indir = nullptr;
// scales
const float* query_weight_output_scale = nullptr;
const float* attention_qk_scale = nullptr;
const float* attention_output_weight_input_scale_inv = nullptr;
// Stride to handle the case when KQV is a single buffer
int stride = 0;
// The batch size.
int batch_size = 0;
// The beam width
int beam_width = 0;
// The sequence length.
int memory_max_len = 0;
// The number of heads (H).
int num_heads = 0;
// The hidden dimension per head (Dh).
int hidden_size_per_head = 0;
// The per-head latent space reserved for rotary embeddings.
int rotary_embedding_dim = 0;
// The maximum length of input sentences.
int max_input_length = 0;
// The current timestep. TODO(bhsueh) Check whether we only need this param for cross attention.
int timestep = 0;
// The current timestep of each sentence (supports a different timestep per sentence).
// The 1.f / sqrt(Dh). Computed on the host.
float inv_sqrt_dh = 0.0f;
// Used when we have some input context, e.g. GPT.
const int* total_padding_tokens = nullptr;
const bool* masked_tokens = nullptr;
const int* prefix_prompt_lengths = nullptr;
int max_prefix_prompt_length = 0;
const T* relative_attention_bias = nullptr;
int relative_attention_bias_stride = 0;
// The slope per head of linear position bias to attention score (H).
const T* linear_bias_slopes = nullptr;
const T* ia3_key_weights = nullptr;
const T* ia3_value_weights = nullptr;
const int* ia3_tasks = nullptr;
const float* qkv_scale_out = nullptr;
const float* attention_out_scale = nullptr;
int int8_mode = 0;
};
template<typename T>
struct Multihead_attention_params: public Multihead_attention_params_base<T> {
// allows attention to exit early
bool* finished = nullptr;
// required for masked attention when sequences have different lengths
const int* length_per_sample = nullptr;
T** k_cache_per_sample = nullptr;
T** v_cache_per_sample = nullptr;
size_t kv_cache_per_sample_offset = 0;
bool k_cache_interleaved = true;
};
template<class T>
using Masked_multihead_attention_params = Multihead_attention_params<T>;
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream);
void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
#ifdef ENABLE_BF16
void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream);
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
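A heavily abridged sketch of driving one fp32 decode step with this interface (buffer names are placeholders; the .cu above only dispatches hidden_size_per_head == 128, so that value is fixed here):
#include <math.h>

// Sketch only: fill the mandatory fields and launch a single masked-attention step.
void run_decode_step(float* out, const float* q, const float* k, const float* v,
                     float* k_cache, float* v_cache,
                     int batch_size, int num_heads, int memory_max_len, int timestep,
                     cudaStream_t stream)
{
    Masked_multihead_attention_params<float> params;
    params.out                  = out;
    params.q                    = q;
    params.k                    = k;
    params.v                    = v;
    params.k_cache              = k_cache;
    params.v_cache              = v_cache;
    params.batch_size           = batch_size;
    params.beam_width           = 1;
    params.num_heads            = num_heads;
    params.hidden_size_per_head = 128;                    // the only head size dispatched above
    params.memory_max_len       = memory_max_len;
    params.timestep             = timestep;
    params.inv_sqrt_dh          = 1.0f / sqrtf(128.0f);   // computed on the host, as the comment requires
    masked_multihead_attention(params, stream);
}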