Unverified Commit 9efcac38 authored by Li Zhang, committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdlib.h>
namespace fastertransformer {
// clang-format off
template<typename T> struct GeluActivation;
template<typename T> struct ReluActivation;
template<typename T> struct SiluActivation;
template<typename T> struct IdentityActivation;
// clang-format on
template<template<typename T> class Activation, typename T, typename BT>
void invokeGenericActivation(T* out,
const BT* bias,
const T* gated_weights,
const BT* gated_bias,
const int* ia3_tasks,
const T* ia3_weights,
const int m,
const int n,
const int int8_mode,
const float* activation_in,
const float* activation_out,
const int* padding_offset,
const int seq_len,
cudaStream_t stream);
template<template<typename T> class Activation, typename T, typename BT>
void invokeGenericActivation(T* out,
const BT* bias,
const T* gated_weights,
const BT* gated_bias,
const int* ia3_tasks,
const T* ia3_weights,
const int m,
const int n,
const int int8_mode,
const float* activation_in,
const float* activation_out,
cudaStream_t stream)
{
invokeGenericActivation<Activation, T, BT>(out,
bias,
gated_weights,
gated_bias,
ia3_tasks,
ia3_weights,
m,
n,
int8_mode,
activation_in,
activation_out,
(const int*)nullptr,
0,
stream);
}
template<typename T>
void invokeAddBiasGeluV2(T* out,
const T* bias,
const int* ia3_tasks,
const T* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
template<typename T>
void invokeAddBias(T* out, T const* bias, const int m, const int n, cudaStream_t stream)
{
invokeGenericActivation<IdentityActivation, T, T>(
out, bias, nullptr, nullptr, nullptr, nullptr, m, n, 0, nullptr, nullptr, stream);
}
template<typename T>
void invokeAddBiasGeluV2(
T* out, const T* bias, const int* ia3_tasks, const T* ia3_weights, const int m, const int n, cudaStream_t stream)
{
invokeAddBiasGeluV2(out, bias, ia3_tasks, ia3_weights, nullptr, 0, m, n, stream);
}
template<typename T>
void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream);
template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream);
} // namespace fastertransformer
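A minimal call-site sketch for the declarations above, assuming this header is included (buffer names are placeholders, not part of the API); the gated, IA3, and int8 arguments are all optional and can be passed as nullptr/0:
// Sketch only: apply SiLU + bias to an [m, n] fp16 intermediate buffer.
void apply_silu_bias(half* inter_buf, const half* bias, int m, int n, cudaStream_t stream)
{
    using namespace fastertransformer;
    invokeGenericActivation<SiluActivation, half, half>(inter_buf,
                                                        bias,
                                                        nullptr,   // gated_weights
                                                        nullptr,   // gated_bias
                                                        nullptr,   // ia3_tasks
                                                        nullptr,   // ia3_weights
                                                        m,
                                                        n,
                                                        0,         // int8_mode
                                                        nullptr,   // activation_in scale
                                                        nullptr,   // activation_out scale
                                                        stream);
}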
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
__global__ void ban_bad_words(T* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int beam_width,
const int* bad_words,
size_t bad_words_len,
bool share_words,
int id_offset,
int vocab_size_padded,
size_t step)
{
const int id = blockIdx.x * blockDim.x + threadIdx.x;
const int batch_idx = blockIdx.y / beam_width;
const int beam_idx = blockIdx.y % beam_width;
const int* base_bad_words = share_words ? bad_words : bad_words + batch_idx * 2 * bad_words_len;
const int* base_bad_words_offsets = base_bad_words + bad_words_len;
if (id >= bad_words_len || base_bad_words_offsets[id] < 0) {
return;
}
const int item_end = base_bad_words_offsets[id];
const int item_start = (id > 0) ? base_bad_words_offsets[id - 1] : 0;
const int item_size = item_end - item_start;
/* The single-token case unconditionally bans the token */
bool should_ban = item_size == 1;
/* Multi-token case and enough previously generated tokens to look for a match */
if (item_size > 1 && step >= item_size - 1) {
should_ban = true;
int parent_id = beam_idx;
const bool gather_beam = beam_width > 1;
for (int token_idx = item_size - 2; token_idx >= 0; token_idx--) {
const int previous_token = output_ids_buf[(step - (item_size - 1) + token_idx) * batch_size * beam_width
+ id_offset + batch_idx * beam_width + parent_id];
if (previous_token != base_bad_words[item_start + token_idx]) {
should_ban = false;
break;
}
if (gather_beam) {
parent_id = parent_ids_buf[(step - (item_size - 1) + token_idx) * beam_width * batch_size + id_offset
+ batch_idx * beam_width + parent_id];
if (parent_id < 0 || parent_id >= beam_width) {
should_ban = false;
break;
}
}
}
}
if (should_ban) {
int banned_token = base_bad_words[item_end - 1];
if (0 < banned_token && banned_token < vocab_size_padded) {
logits[batch_idx * beam_width * vocab_size_padded + beam_idx * vocab_size_padded + banned_token] =
static_cast<T>(-INFINITY);
}
}
}
template<typename T>
void invokeBanBadWords(T* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream)
{
dim3 block, grid;
block.x = min(((bad_words_len + 32 - 1) / 32) * 32, 256UL);
grid.x = (bad_words_len + block.x - 1) / block.x;
grid.y = local_batch_size * beam_width;
ban_bad_words<<<grid, block, 0, stream>>>(logits,
output_ids_buf,
parent_ids_buf,
batch_size,
beam_width,
bad_words,
bad_words_len,
share_words,
id_offset,
vocab_size_padded,
step);
sync_check_cuda_error();
}
template void invokeBanBadWords(half* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeBanBadWords(__nv_bfloat16* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
#endif
template void invokeBanBadWords(float* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace fastertransformer {
template<typename T>
void invokeBanBadWords(T* logits,
const int* output_ids_buf,
const int* parent_ids_buf,
int batch_size,
int local_batch_size,
int beam_width,
const int* bad_words,
bool share_words,
size_t bad_words_len,
int id_offset,
int vocab_size_padded,
size_t step,
cudaStream_t stream);
} // namespace fastertransformer
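From the kernel in ban_bad_words.cu above, each bad-words entry consists of two int rows of length bad_words_len: row 0 holds the flattened token ids of all banned sequences, row 1 holds their cumulative end offsets (negative for unused slots). A host-side sketch of building that layout for the banned sequences {5} and {7, 8, 9} (illustrative only; the layout is inferred from the kernel, not documented here):
#include <vector>

std::vector<int> make_bad_words_buffer()
{
    const int bad_words_len = 4;                 // total flattened tokens across both sequences
    std::vector<int> buf(2 * bad_words_len, -1);
    // Row 0: flattened token ids of {5} and {7, 8, 9}.
    buf[0] = 5;  buf[1] = 7;  buf[2] = 8;  buf[3] = 9;
    // Row 1: cumulative end offsets into row 0 (-1 marks unused slots).
    buf[bad_words_len + 0] = 1;                  // {5} ends at offset 1
    buf[bad_words_len + 1] = 4;                  // {7, 8, 9} ends at offset 4
    return buf;                                  // copy to device and pass as `bad_words`, with share_words = true
}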
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
namespace fastertransformer {
template<typename T>
__global__ void add_bias_temperature(T* logits,
const T* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
int tid = threadIdx.x;
int bid = blockIdx.x;
int bbid = blockIdx.y;
logits += bbid * vocab_size_padded;
const T MASK_VAL = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
const T inv_temp = static_cast<T>(1.0f / (temperature + 1e-6f));
for (int i = tid + bid * blockDim.x; i < vocab_size_padded; i += blockDim.x * gridDim.x) {
if (i < vocab_size) {
T bias_val = bias == nullptr ? (T)(0.0f) : bias[i];
logits[i] = (logits[i] + bias_val) * inv_temp;
}
else {
logits[i] = MASK_VAL;
}
}
}
template<>
__global__ void add_bias_temperature(half2* logits,
const half2* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
assert(vocab_size % 2 == 0);
assert(vocab_size_padded % 2 == 0);
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int bbid = blockIdx.y;
const half2 mask_val = __float2half2_rn(-HALF_FLT_MAX);
const half2 inv_temp = __float2half2_rn(1.0f / (temperature + 1e-6f));
const int half_vocab_size = vocab_size / 2;
const int half_vocab_size_padded = vocab_size_padded / 2;
logits += bbid * half_vocab_size_padded;
for (int index = tid + bid * blockDim.x; index < half_vocab_size_padded; index += blockDim.x * gridDim.x) {
int vocab_idx = index % half_vocab_size_padded;
half2 logit = vocab_idx < half_vocab_size ? __ldg(&logits[index]) : mask_val;
if (vocab_idx < half_vocab_size) {
if (bias != nullptr) {
logit = __hadd2(logit, bias[vocab_idx]);
}
logit = __hmul2(logit, inv_temp);
}
logits[index] = logit;
}
}
template<typename T, bool IS_ADDITIVE>
__global__ void apply_repetition_penalty(T* logits,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int step,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int max_input_length,
const float repetition_penalty)
{
assert(step > 0);
const int tid = threadIdx.x;
const int bbid = blockIdx.x;
const int batch_id = bbid / beam_width;
const int bbsize = batch_size * beam_width;
logits += bbid * vocab_size_padded;
extern __shared__ char sbuf[];
T* penalty_logits = reinterpret_cast<T*>(sbuf);
// prevent misalignment when sizeof(T) == 2
int* penalty_indices = reinterpret_cast<int*>(sbuf + (sizeof(T) * step + 31) / 32 * 32);
const int input_length = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length;
if (tid == 0) {
T repet_penalty = static_cast<T>(repetition_penalty);
int prev_id = current_ids[bbid];
T prev_logit = logits[prev_id];
penalty_indices[step - 1] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[step - 1] = prev_logit - repet_penalty;
}
else {
penalty_logits[step - 1] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
if (step > 1) {
int parent_beam = bbid % beam_width;
for (int i = step - 2; i >= 0; --i) {
// Skip the padded tokens.
if (i >= input_length && i < max_input_length) {
continue;
}
parent_beam = parent_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_id = previous_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_logit = logits[prev_id];
penalty_indices[i] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[i] = prev_logit - repet_penalty;
}
else {
penalty_logits[i] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
}
}
}
__syncthreads();
for (int i = tid; i < step; i += blockDim.x) {
if (i >= input_length && i < max_input_length) {
continue;
}
logits[penalty_indices[i]] = penalty_logits[i];
}
}
template<typename T>
__global__ void apply_min_length_penalty(T* logits,
const int min_length,
const int* end_ids,
const int* sequence_lengths,
const int max_input_length,
const int beam_width,
const int vocab_size_padded)
{
int bbid = threadIdx.x + blockIdx.x * blockDim.x; // batch-beam index
int bid = bbid / beam_width; // batch index
// We need +1 because sequence_lengths = max_input_length + num_gen_tokens - 1,
// which is equal to the length of k/v caches.
if (sequence_lengths[bbid] + 1 - max_input_length < min_length) {
T mask_val = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val;
}
}
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream)
{
if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) {
dim3 block(512);
if (std::is_same<T, half>::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) {
dim3 grid((vocab_size_padded / 2 + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(reinterpret_cast<half2*>(logits),
reinterpret_cast<const half2*>(bias),
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
temperature);
}
else {
dim3 grid((vocab_size_padded + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(
logits, bias, batch_size, beam_width, vocab_size, vocab_size_padded, temperature);
}
}
if (repetition_penalty_type != RepetitionPenaltyType::None && step > 0) {
if (repetition_penalty != getDefaultPenaltyValue(repetition_penalty_type)) {
size_t smem_size = (sizeof(T) * step + 31) / 32 * 32 + sizeof(int) * step;
dim3 block(256);
dim3 grid(beam_width * local_batch_size);
if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) {
apply_repetition_penalty<T, false>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
// TODO(jaedeokk):
// Remove (+ite ...) by getting parent_ids with offset
// and then remove 'ite' argument from the function.
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
else if (repetition_penalty_type == RepetitionPenaltyType::Additive) {
apply_repetition_penalty<T, true>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
}
}
if (step - max_input_length < min_length) {
FT_CHECK_WITH_INFO(sequence_lengths != nullptr, "Need sequence_lengths to apply min length penalty");
FT_CHECK_WITH_INFO(end_ids != nullptr, "Need end_id to apply min length penalty");
const int block_size = min(local_batch_size * beam_width, 1024);
const int grid_size = (local_batch_size * beam_width + block_size - 1) / block_size;
apply_min_length_penalty<<<grid_size, block_size, 0, stream>>>(
logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded);
}
}
template void invokeAddBiasApplyPenalties(int step,
float* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const float* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
template void invokeAddBiasApplyPenalties(int step,
half* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const half* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_fp16.h>
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace fastertransformer
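The two repetition-penalty variants handled above reduce to a simple per-token rule; a scalar sketch of the update applied to each previously generated token's logit x with penalty p (mirrors apply_repetition_penalty in the .cu above, not part of this header's API):
// Sketch of the per-token penalty rule.
float penalized_logit(float x, float p, bool additive)
{
    if (additive) {
        return x - p;                   // RepetitionPenaltyType::Additive
    }
    return x > 0.f ? x / p : x * p;     // RepetitionPenaltyType::Multiplicative
}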
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#pragma once
namespace fastertransformer {
// In the original beam search implementation, if a beam is finished, we mark it as finished
// and only continue beam search on the remaining beams (namely, beam_width - 1 beams in the next step).
//
// In this implementation, when a beam is finished, we trace its path and record it in output_ids_tgt,
// and we also record the normalized scores. The beam search then continues to use `beam_width` beams in
// the next step.
//
// After we have collected `beam_width` beams, we sort them by their norm_scores.
struct BeamHypotheses {
int* output_ids_tgt = nullptr;
int* sequence_lengths_tgt = nullptr;
float* cum_log_probs = nullptr; // cumulative log probabilities
float* normed_scores = nullptr; // cum_log_probs / (length ** length_penalty)
float* log_probs = nullptr; // log probs of each generated token
float* min_normed_scores = nullptr; // record the min normed scores for each batch
int* num_beams = nullptr; // the number of finished beams we collect
bool* is_done = nullptr;
// Used to set inputs
const int* output_ids_src;
const int* parent_ids_src;
const int* sequence_lengths_src;
const int* end_ids;
const float* log_probs_src;
// some variables for kernels
int step;
int ite;
int batch_size;
int local_batch_size;
int max_seq_len;
float length_penalty;
bool early_stopping = true;
bool is_return_normed_score = true; // return normed_cum_log_probs or cum_log_probs
};
template<typename T>
void invokeTopkBeamSearch(void* workspace,
size_t& workspace_size,
T* log_probs,
int* ids,
BeamHypotheses* beam_hyps,
const bool* finished,
const int* sequence_lengths,
const int batch_size,
const int beam_width,
const int vocab_size_padded_,
const T diversity_rate,
const float length_penalty,
const int* end_ids,
cudaStream_t stream);
template<typename T>
void invokeTileEncoderResults(T* tiled_encoder_output,
int* tiled_encoder_sequence_length,
const T* encoder_output,
const int* encoder_sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const bool* finished,
const float* cum_log_probs,
const int batch_size,
const int beam_width,
cudaStream_t stream);
} // namespace fastertransformer
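Per the comments on BeamHypotheses, a finished hypothesis is ranked by its length-normalized score; a host-side sketch of that formula (illustrative only):
#include <cmath>

// normed_score = cum_log_probs / (length ** length_penalty), as described above.
float normed_score(float cum_log_probs, int length, float length_penalty)
{
    return cum_log_probs / std::pow(static_cast<float>(length), length_penalty);
}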
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifdef ENABLE_FP8
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#endif // ENABLE_FP8
namespace fastertransformer {
void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
size_t* h_token_num,
int* tmp_mask_offset,
int* cu_seqlens,
const int* sequence_length,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
inline void invokeGetPaddingOffset(size_t* h_pinned_token_num,
size_t* h_token_num,
int* tmp_mask_offset,
const int* sequence_length,
const int batch_size,
const int max_seq_len,
cudaStream_t stream)
{
invokeGetPaddingOffsetAndCuSeqLens(
h_pinned_token_num, h_token_num, tmp_mask_offset, nullptr, sequence_length, batch_size, max_seq_len, stream);
}
template<typename T>
void invokeBuildEncoderAttentionMask(
T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len,
cudaStream_t stream);
template<typename T>
void invokeRebuildPadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
template<typename T>
void invokeRemovePadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
template<typename T>
void invokeBuildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
template<typename T_OUT, typename T_IN>
struct getLastTokenDequantizeParam {
T_OUT* const output;
T_IN const* const input;
float const* const input_scale;
const int batch_size;
const int max_seq_len;
const int d_model;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param);
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
struct QuantizeMatrixRebuildPaddingParam {
T_OUT* dst;
const T_IN* src;
const int* padding_offset;
const int token_num;
const int d_model;
const float* scale;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif // ENABLE_FP8
} // namespace fastertransformer
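A hedged sketch of the usual pad/unpad round trip built from these declarations, assuming this header is included (buffer names are placeholders; h_pinned_token_num must point to pinned host memory):
// Sketch only: pack a padded [batch, max_seq_len, hidden] activation into [token_num, hidden] and back.
template<typename T>
void pad_unpad_round_trip(T* padded_in, T* packed, T* padded_out,
                          int* padding_offset, size_t* h_pinned_token_num,
                          const int* d_sequence_lengths,
                          int batch_size, int max_seq_len, int hidden_dim,
                          cudaStream_t stream)
{
    using namespace fastertransformer;
    size_t h_token_num = 0;
    // 1. Count the real (non-padded) tokens and fill padding_offset.
    invokeGetPaddingOffset(h_pinned_token_num, &h_token_num, padding_offset,
                           d_sequence_lengths, batch_size, max_seq_len, stream);
    // 2. Drop padded positions before the heavy GEMMs.
    invokeRemovePadding(packed, padded_in, padding_offset, static_cast<int>(h_token_num), hidden_dim, stream);
    //    ... run the transformer layers on `packed` here ...
    // 3. Scatter the packed rows back to their padded positions.
    invokeRebuildPadding(padded_out, packed, padding_offset, static_cast<int>(h_token_num), hidden_dim, stream);
}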
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <cuda_fp16.h>
#include <iostream>
#include "src/fastertransformer/utils/cuda_utils.h"
#define CUSTOM_AR_SIZE_THRESHOLD 50331648
#define MAX_ALL_REDUCE_BLOCKS 24
#define FLAG(a) ((uint32_t)((a) % 0x146))
#define RANKS_PER_NODE 8
#define WARP_SIZE 32
#define DEFAULT_BLOCK_SIZE 1024
#define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216
namespace fastertransformer {
#ifdef ENABLE_BF16
typedef struct bf168 {
__nv_bfloat162 x;
__nv_bfloat162 y;
__nv_bfloat162 z;
__nv_bfloat162 w;
} bf168;
#endif
template<typename T>
struct AllReduceParams {
size_t elts_total;
size_t elts_per_rank;
size_t elts_per_block;
size_t rank_offset;
size_t rank, local_rank, node_id;
uint32_t barrier_flag;
uint32_t* peer_barrier_ptrs[RANKS_PER_NODE];
T* peer_comm_buffer_ptrs[RANKS_PER_NODE];
T* local_output_buffer_ptr;
};
template<typename T>
void invokeOneOrTwoShotAllReduceKernel(AllReduceParams<T>& param, cudaStream_t stream);
void kernelLaunchConfig(int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo);
} // namespace fastertransformer
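A small sketch of querying a launch configuration with kernelLaunchConfig (the mapping of kernel_algo values to the one-shot/two-shot variants and the byte-size threshold rule are assumptions for illustration, not documented by this header):
#include <cuda_fp16.h>

// Sketch only: choose grid/block sizes for a custom all-reduce of `elts` fp16 elements.
void pick_ar_launch_config(size_t elts, int& blocks_per_grid, int& threads_per_block)
{
    const size_t msg_bytes   = elts * sizeof(half);
    const int    kernel_algo = msg_bytes <= DEFALUT_ALGO_AR_SIZE_THRESHOLD ? 0 : 1;  // assumed mapping
    fastertransformer::kernelLaunchConfig(blocks_per_grid, threads_per_block, elts, kernel_algo);
}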
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include <assert.h>
#include <float.h>
#include <type_traits>
template<typename T, typename KERNEL_PARAMS_TYPE>
void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
{
switch (params.hidden_size_per_head) {
case 128:
mmha_launch_kernel<T, 128, 128, KERNEL_PARAMS_TYPE>(params, stream);
break;
default:
assert(false);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream)
{
multihead_attention_<float, Masked_multihead_attention_params<float>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream)
{
multihead_attention_<uint16_t, Masked_multihead_attention_params<uint16_t>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_BF16
void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream)
{
multihead_attention_<__nv_bfloat16, Masked_multihead_attention_params<__nv_bfloat16>>(params, stream);
}
#endif
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
////////////////////////////////////////////////////////////////////////////////////////////////////
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
exit(1); \
} \
} while (0)
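// A hedged usage sketch of the macro (illustrative; the helper below is not part of this header):
// static void clear_buffer(void* dst, size_t bytes, cudaStream_t stream)
// {
//     // On failure, prints file/line plus the CUDA error string and exits.
//     CHECK_CUDA(cudaMemsetAsync(dst, 0, bytes, stream));
// }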
////////////////////////////////////////////////////////////////////////////////////////////////////
// The structure of parameters for the masked multihead attention kernel.
//
// We use the following terminology to describe the different dimensions.
//
// B: Batch size (number of sequences),
// L: Sequence length,
// D: Hidden dimension,
// H: Number of heads,
// Dh: Hidden dimension per head - Dh = D / H.
template<typename T>
struct Multihead_attention_params_base {
// The output buffer. Dimensions B x D.
T* out = nullptr;
// The input Qs and the associated bias. Dimensions B x D and D, resp.
const T *q = nullptr, *q_bias = nullptr;
// The input Ks and the associated bias. Dimensions B x D and D, resp.
const T *k = nullptr, *k_bias = nullptr;
// The input Vs and the associated bias. Dimensions B x D and D, resp.
const T *v = nullptr, *v_bias = nullptr;
// The cache for the Ks. The size must be at least B x L x D.
T* k_cache = nullptr;
// The cache for the Vs. The size must be at least B x L x D.
T* v_cache = nullptr;
// The indirections to use for cache when beam sampling.
const int* cache_indir = nullptr;
// scales
const float* query_weight_output_scale = nullptr;
const float* attention_qk_scale = nullptr;
const float* attention_output_weight_input_scale_inv = nullptr;
// Stride to handle the case when KQV is a single buffer
int stride = 0;
// The batch size.
int batch_size = 0;
// The beam width
int beam_width = 0;
// The sequence length.
int memory_max_len = 0;
// The number of heads (H).
int num_heads = 0;
// The hidden dimension per head (Dh).
int hidden_size_per_head = 0;
// The per-head latent space reserved for rotary embeddings.
int rotary_embedding_dim = 0;
// The maximum length of input sentences.
int max_input_length = 0;
// The current timestep. TODO(bhsueh) Check whether we only need this param for cross attention.
int timestep = 0;
// The current timestep of each sentence (supports a different timestep per sentence).
// The 1.f / sqrt(Dh). Computed on the host.
float inv_sqrt_dh = 0.0f;
// Used when we have some input context, e.g. GPT.
const int* total_padding_tokens = nullptr;
const bool* masked_tokens = nullptr;
const int* prefix_prompt_lengths = nullptr;
int max_prefix_prompt_length = 0;
const T* relative_attention_bias = nullptr;
int relative_attention_bias_stride = 0;
// The slope per head of linear position bias to attention score (H).
const T* linear_bias_slopes = nullptr;
const T* ia3_key_weights = nullptr;
const T* ia3_value_weights = nullptr;
const int* ia3_tasks = nullptr;
const float* qkv_scale_out = nullptr;
const float* attention_out_scale = nullptr;
int int8_mode = 0;
};
template<typename T>
struct Multihead_attention_params: public Multihead_attention_params_base<T> {
// allows attention to exit early
bool* finished = nullptr;
// required for masked attention when sequences have different lengths
const int* length_per_sample = nullptr;
T** k_cache_per_sample = nullptr;
T** v_cache_per_sample = nullptr;
size_t kv_cache_per_sample_offset = 0;
bool k_cache_interleaved = true;
};
template<class T>
using Masked_multihead_attention_params = Multihead_attention_params<T>;
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream);
void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
#ifdef ENABLE_BF16
void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream);
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
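A heavily abridged sketch of driving one fp32 decode step with this interface (buffer names are placeholders; the .cu above only dispatches hidden_size_per_head == 128, so that value is fixed here):
#include <math.h>

// Sketch only: fill the mandatory fields and launch a single masked-attention step.
void run_decode_step(float* out, const float* q, const float* k, const float* v,
                     float* k_cache, float* v_cache,
                     int batch_size, int num_heads, int memory_max_len, int timestep,
                     cudaStream_t stream)
{
    Masked_multihead_attention_params<float> params;
    params.out                  = out;
    params.q                    = q;
    params.k                    = k;
    params.v                    = v;
    params.k_cache              = k_cache;
    params.v_cache              = v_cache;
    params.batch_size           = batch_size;
    params.beam_width           = 1;
    params.num_heads            = num_heads;
    params.hidden_size_per_head = 128;                    // the only head size dispatched above
    params.memory_max_len       = memory_max_len;
    params.timestep             = timestep;
    params.inv_sqrt_dh          = 1.0f / sqrtf(128.0f);   // computed on the host, as the comment requires
    masked_multihead_attention(params, stream);
}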