Unverified Commit 9efcac38 authored by Li Zhang, committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T>
struct AttentionINT8Weight: AttentionWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
__global__ void update_indir_cache_kernel(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int start_step,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step)
{
int time_step = threadIdx.x + blockIdx.x * blockDim.x;
int bb_id = threadIdx.y + blockIdx.y * blockDim.y;
const int batch_id = bb_id / beam_width;
const int beam_id = bb_id % beam_width;
if (bb_id >= beam_width * local_batch_size || time_step >= min(step + 1, max_seq_len) || finished[bb_id]) {
return;
}
time_step += start_step;
const int time_step_circ = time_step % max_seq_len;
const int src_beam = beam_ids[batch_id * beam_width + beam_id];
const uint tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ;
const uint src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ;
tgt_indir_cache[tgt_offset] = (time_step == step) ? beam_id : src_indir_cache[src_offset];
}
void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step,
cudaStream_t stream)
{
const dim3 block(32);
const int start_step = max(0, step + 1 - max_seq_len);
const int num_steps = min(step + 1, max_seq_len);
// Update the indirection entries for steps in [start_step, step], inclusive
const dim3 grid((num_steps + block.x - 1) / block.x, local_batch_size * beam_width);
update_indir_cache_kernel<<<grid, block, 0, stream>>>(tgt_indir_cache,
src_indir_cache,
beam_ids,
finished,
start_step,
batch_dim,
local_batch_size,
beam_width,
max_seq_len,
step);
}
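// Host-side reference of the indirection-cache update above (illustration only;
// update_indir_cache_reference is not a FasterTransformer symbol). For every
// unfinished (batch, beam) pair it rewrites the steps [start_step, step] of the
// target cache: past steps gather the entries of the selected source beam, while
// the current step records the beam id itself. The modulo by max_seq_len mirrors
// the circular K/V-cache slot layout used by the kernel.
static void update_indir_cache_reference(int*        tgt_indir_cache,
                                         const int*  src_indir_cache,
                                         const int*  beam_ids,
                                         const bool* finished,
                                         int         local_batch_size,
                                         int         beam_width,
                                         int         max_seq_len,
                                         int         step)
{
    const int start_step = (step + 1 > max_seq_len) ? (step + 1 - max_seq_len) : 0;
    const int num_steps  = (step + 1 < max_seq_len) ? (step + 1) : max_seq_len;
    for (int bb_id = 0; bb_id < local_batch_size * beam_width; bb_id++) {
        if (finished[bb_id]) {
            continue;
        }
        const int batch_id = bb_id / beam_width;
        const int beam_id  = bb_id % beam_width;
        const int src_beam = beam_ids[batch_id * beam_width + beam_id];
        for (int t = 0; t < num_steps; t++) {
            const int time_step = start_step + t;
            const int slot      = time_step % max_seq_len;
            const int tgt       = (batch_id * beam_width + beam_id) * max_seq_len + slot;
            const int src       = (batch_id * beam_width + src_beam) * max_seq_len + slot;
            tgt_indir_cache[tgt] = (time_step == step) ? beam_id : src_indir_cache[src];
        }
    }
}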
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr),
vocab_size_(vocab_size),
vocab_size_padded_(vocab_size_padded)
{
}
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer):
DynamicDecodeBaseLayer(beam_search_layer),
vocab_size_(beam_search_layer.vocab_size_),
vocab_size_padded_(beam_search_layer.vocab_size_padded_),
topk_softmax_workspace_size_(beam_search_layer.topk_softmax_workspace_size_)
{
}
template<typename T>
BaseBeamSearchLayer<T>::~BaseBeamSearchLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
freeBuffer();
}
template<typename T>
void BaseBeamSearchLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)(&topk_softmax_workspace_));
is_allocate_buffer_ = false;
}
}
template<typename T>
void BaseBeamSearchLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
// do nothing.
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(std::vector<Tensor>* output_tensors, const std::vector<Tensor>* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
std::unordered_map<std::string, Tensor> input_tensors_map{{"logits", input_tensors->at(0)},
{"embedding_bias", input_tensors->at(1)},
{"step", input_tensors->at(2)},
{"src_cache_indirection", input_tensors->at(4)},
{"max_input_length", input_tensors->at(5)},
{"input_lengths", input_tensors->at(6)},
{"ite", input_tensors->at(7)}};
std::unordered_map<std::string, Tensor> output_tensors_map{{"output_ids", output_tensors->at(0)},
{"finished", output_tensors->at(1)},
{"cum_log_probs", output_tensors->at(2)},
{"parent_ids", output_tensors->at(3)},
{"sequence_length", output_tensors->at(4)},
{"tgt_cache_indirection", output_tensors->at(5)}};
forward(&output_tensors_map, &input_tensors_map);
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// end_id [local_batch_size]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width], optional
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// presence_penalty [1] on cpu, optional
// Only one of repetition and presence penalties is allowed.
// min_length [1] on cpu, int, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width], optional
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width], optional
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size, beam_width], optional
// beam_hyps, optional
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 5);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
allocateBuffer(batch_size, beam_width);
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float temperature = input_tensors->getVal<float>("temperature", 1.0f);
const T* embedding_bias = input_tensors->getPtr<const T>("embedding_bias", nullptr);
RepetitionPenaltyType repetition_penalty_type = RepetitionPenaltyType::None;
float repetition_penalty = getDefaultPenaltyValue(repetition_penalty_type);
if (input_tensors->isExist("repetition_penalty") || input_tensors->isExist("presence_penalty")) {
FT_CHECK_WITH_INFO(
!(input_tensors->isExist("repetition_penalty") && input_tensors->isExist("presence_penalty")),
"Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
"Please provide one of repetition_penalty or presence_penalty.");
repetition_penalty_type = input_tensors->isExist("repetition_penalty") ? RepetitionPenaltyType::Multiplicative :
RepetitionPenaltyType::Additive;
repetition_penalty = repetition_penalty_type == RepetitionPenaltyType::Multiplicative ?
input_tensors->getVal<float>("repetition_penalty") :
input_tensors->getVal<float>("presence_penalty");
}
invokeAddBiasApplyPenalties(
step,
input_tensors->at("logits").getPtr<T>(),
output_tensors->at("output_ids")
.getPtrWithOffset<const int>((step - 1) * batch_size * beam_width + ite * local_batch_size * beam_width),
output_tensors->getPtr<const int>("output_ids"),
output_tensors->getPtr<const int>("parent_ids"),
input_tensors->getPtr<const int>("input_lengths", nullptr),
output_tensors->getPtr<const int>("sequence_length", nullptr),
embedding_bias,
ite,
input_tensors->getVal<int>("max_input_length"),
local_batch_size,
batch_size,
beam_width,
vocab_size_,
vocab_size_padded_,
input_tensors->getPtr<const int>("end_id", nullptr),
temperature,
repetition_penalty,
repetition_penalty_type,
input_tensors->getVal<const int>("min_length", 0),
stream_);
sync_check_cuda_error();
invokeSoftMax(output_tensors, input_tensors);
if (beam_width > 1) {
const int max_seq_len = output_tensors->at("output_ids").shape[0];
update_indir_cache_kernelLauncher(
output_tensors->at("tgt_cache_indirection").getPtr<int>(),
input_tensors->at("src_cache_indirection").getPtr<const int>(),
output_tensors->at("parent_ids")
.getPtrWithOffset<const int>(+step * beam_width * batch_size + ite * local_batch_size * beam_width),
output_tensors->at("finished").getPtr<const bool>(),
batch_size,
local_batch_size,
beam_width,
max_seq_len,
step,
stream_);
sync_check_cuda_error();
}
sync_check_cuda_error();
if (is_free_buffer_after_forward_) {
freeBuffer();
}
sync_check_cuda_error();
}
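// Illustrative caller-side sketch (not part of the original source): how the named
// tensors consumed by the TensorMap overload of forward() above could be assembled.
// Every d_* pointer, scalar and extent is a placeholder the caller is assumed to own;
// only keys documented in the comment block of forward() are used.
template<typename T>
static void forwardOneStepExample(BaseBeamSearchLayer<T>* layer,
                                  T*     d_logits,
                                  int*   d_end_ids,
                                  int*   d_input_lengths,
                                  int*   d_src_cache_indirection,
                                  int*   d_tgt_cache_indirection,
                                  int*   d_output_ids,
                                  bool*  d_finished,
                                  float* d_cum_log_probs,
                                  int*   d_parent_ids,
                                  int*   d_sequence_length,
                                  size_t local_batch_size,
                                  size_t batch_size,
                                  size_t beam_width,
                                  size_t vocab_size_padded,
                                  size_t max_seq_len,
                                  int    step,
                                  int    ite,
                                  int    max_input_length)
{
    const DataType data_type = getTensorType<T>();
    TensorMap input_map(std::unordered_map<std::string, Tensor>{
        {"logits", Tensor(MEMORY_GPU, data_type, {local_batch_size, beam_width, vocab_size_padded}, d_logits)},
        {"step", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &step)},
        {"ite", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &ite)},
        {"max_input_length", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &max_input_length)},
        {"end_id", Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size}, d_end_ids)},
        {"input_lengths", Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, d_input_lengths)},
        {"src_cache_indirection",
         Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size, beam_width, max_seq_len}, d_src_cache_indirection)}});
    TensorMap output_map(std::unordered_map<std::string, Tensor>{
        {"output_ids", Tensor(MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids)},
        {"finished", Tensor(MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, d_finished)},
        {"cum_log_probs", Tensor(MEMORY_GPU, TYPE_FP32, {local_batch_size * beam_width}, d_cum_log_probs)},
        {"parent_ids", Tensor(MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size * beam_width}, d_parent_ids)},
        {"sequence_length", Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, d_sequence_length)},
        {"tgt_cache_indirection",
         Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size, beam_width, max_seq_len}, d_tgt_cache_indirection)}});
    layer->forward(&output_map, &input_map);
}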
template class BaseBeamSearchLayer<float>;
template class BaseBeamSearchLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
namespace fastertransformer {
template<typename T>
class BaseBeamSearchLayer: public DynamicDecodeBaseLayer {
private:
void freeBuffer();
protected:
// meta data
size_t vocab_size_;
size_t vocab_size_padded_;
size_t topk_softmax_workspace_size_;
void* topk_softmax_workspace_ = nullptr;
virtual void allocateBuffer() = 0;
virtual void allocateBuffer(size_t batch_size, size_t beam_width) = 0;
virtual void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
public:
BaseBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer);
~BaseBeamSearchLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) override;
void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) override;
void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
};
void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h"
namespace fastertransformer {
template<typename T>
__global__ void logProbAddCumLogProb(float* log_probs,
const T* logits,
const float* cum_log_probs,
const int* end_ids,
const bool* finished,
const int beam_width,
const int n)
{
int bid = blockIdx.x;
bool finish = finished != nullptr ? finished[bid] : false;
int offset = bid * n;
float max_val = -1 * FLT_MAX;
__shared__ float s_max_val;
__shared__ float s_sum_val;
if (finish) {
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = (tid == end_ids[bid / beam_width]) ? cum_log_probs[bid] : -FLT_MAX;
}
}
else {
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = (float)(logits[offset + tid]);
max_val = max(max_val, log_probs[offset + tid]);
}
max_val = blockReduceMax(max_val);
if (threadIdx.x == 0) {
s_max_val = max_val;
}
__syncthreads();
float sum_val = 0.0f;
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = __expf(log_probs[offset + tid] - s_max_val);
sum_val += log_probs[offset + tid];
}
sum_val = blockReduceSum(sum_val);
if (threadIdx.x == 0) {
s_sum_val = sum_val + 1e-6f;
}
__syncthreads();
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = logf(log_probs[offset + tid] / s_sum_val) + cum_log_probs[bid];
}
}
}
template<typename T>
void invokeLogProbAddCumLogProb(float* log_probs,
const T* logits,
const float* cum_log_probs,
const int* end_ids,
const bool* finished,
const int m,
const int beam_width,
const int n,
cudaStream_t stream)
{
dim3 grid(m);
dim3 block(min(n, 1024));
/* n is the vocab_size, e.g., 30000, 7000, ...; vocab_size is usually very large, so each block uses up to 1024 threads. */
logProbAddCumLogProb<<<grid, block, 0, stream>>>(
log_probs, logits, cum_log_probs, end_ids, finished, beam_width, n);
}
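// Host-side reference of logProbAddCumLogProb (illustration only; this helper is not
// part of the original source). For a finished beam, only the end token keeps the
// beam's cumulative score; otherwise the row is turned into log-softmax values and the
// beam's cumulative log probability is added, matching the kernel above.
static void logProbAddCumLogProbReference(float*       log_probs,
                                          const float* logits,
                                          const float* cum_log_probs,
                                          const int*   end_ids,
                                          const bool*  finished,
                                          int          m,          // local_batch_size * beam_width
                                          int          beam_width,
                                          int          n)          // vocab_size (padded)
{
    for (int bid = 0; bid < m; bid++) {
        const int offset = bid * n;
        if (finished != nullptr && finished[bid]) {
            for (int i = 0; i < n; i++) {
                log_probs[offset + i] = (i == end_ids[bid / beam_width]) ? cum_log_probs[bid] : -FLT_MAX;
            }
            continue;
        }
        float max_val = -FLT_MAX;
        for (int i = 0; i < n; i++) {
            max_val = logits[offset + i] > max_val ? logits[offset + i] : max_val;
        }
        float sum_val = 1e-6f;  // same epsilon as the kernel
        for (int i = 0; i < n; i++) {
            log_probs[offset + i] = expf(logits[offset + i] - max_val);
            sum_val += log_probs[offset + i];
        }
        for (int i = 0; i < n; i++) {
            log_probs[offset + i] = logf(log_probs[offset + i] / sum_val) + cum_log_probs[bid];
        }
    }
}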
template<typename T>
__global__ void updateStatesKernel(T* log_probs,
T* cum_log_probs,
float* output_log_probs,
bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
index += blockDim.x * gridDim.x) {
int batch_id = index / beam_width;
sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;
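// word_ids holds indices of a top-k taken over the flattened [beam_width, vocab_size]
// candidate scores of each batch, so each entry encodes both the source beam and the
// token: beam_id = (id / vocab_size) % beam_width, word_id = id % vocab_size.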
int beam_id = (word_ids[index] / vocab_size) % beam_width;
int word_id = word_ids[index] % vocab_size;
if (output_log_probs != nullptr) {
// get the cum_log_probs of previous run
output_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id]
- cum_log_probs[batch_id * beam_width + beam_id];
}
cum_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id];
sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
finished[index] = word_id == end_ids[batch_id] ? 1 : 0;
parent_ids[index] = beam_id;
word_ids[index] = word_id;
output_ids[index] = word_id;
if (beam_hyps.num_beams != nullptr) {
if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
for (int i = 0; i < beam_width; i++) {
finished[batch_id * beam_width + i] = true;
}
}
}
}
}
void invokeUpdateStates(float* log_probs,
float* cum_log_probs,
float* output_log_probs,
bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses* beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
cudaStream_t stream)
{
dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
dim3 block(256);
updateStatesKernel<float><<<grid, block, 0, stream>>>(log_probs,
cum_log_probs,
output_log_probs,
finished,
parent_ids,
sequence_length,
word_ids,
output_ids,
*beam_hyps,
local_batch_size,
beam_width,
vocab_size,
end_ids);
}
template<typename T>
void BeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size * beam_width], optional
// beam_hyps, optional
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 6);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
input_tensors->at("beam_search_diversity_rate").getVal<float>() :
0.0f;
const float length_penalty =
input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;
const int id_offset = step * batch_size * beam_width + ite * local_batch_size * beam_width;
invokeLogProbAddCumLogProb(float_log_prob_buf_,
input_tensors->at("logits").getPtr<T>(),
output_tensors->at("cum_log_probs").getPtr<float>(),
input_tensors->at("end_id").getPtr<const int>(),
output_tensors->at("finished").getPtr<bool>(),
local_batch_size * beam_width,
beam_width,
vocab_size_padded_,
stream_);
sync_check_cuda_error();
BeamHypotheses beam_hyps;
if (output_tensors->isExist("beam_hyps") && diversity_rate == 0.0f) {
beam_hyps = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
beam_hyps.step = step;
beam_hyps.ite = ite;
beam_hyps.local_batch_size = local_batch_size;
beam_hyps.batch_size = output_tensors->at("output_ids").shape[1];
beam_hyps.max_seq_len = output_tensors->at("output_ids").shape[0];
beam_hyps.output_ids_src = output_tensors->at("output_ids").getPtr<int>();
beam_hyps.parent_ids_src = output_tensors->at("parent_ids").getPtr<int>();
beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
beam_hyps.length_penalty = length_penalty;
}
invokeTopkBeamSearch<float>(topk_softmax_workspace_,
topk_softmax_workspace_size_,
float_log_prob_buf_,
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
output_tensors->at("finished").getPtr<bool>(),
output_tensors->isExist("sequence_length") ?
output_tensors->at("sequence_length").getPtr<int>() :
(int*)nullptr,
local_batch_size,
beam_width,
vocab_size_padded_,
diversity_rate,
length_penalty,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
invokeUpdateStates(float_log_prob_buf_,
output_tensors->at("cum_log_probs").getPtr<float>(),
output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
}
template<typename T>
void BeamSearchLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void BeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
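// A null workspace pointer makes this first invokeTopkBeamSearch call only compute the
// required workspace size and store it in topk_softmax_workspace_size_; no beam search
// is performed. The buffer allocated below additionally reserves
// batch_size * beam_width * vocab_size_padded_ floats for float_log_prob_buf_.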
invokeTopkBeamSearch<float>(nullptr,
topk_softmax_workspace_size_,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
batch_size,
beam_width,
vocab_size_padded_,
0.0f, // diversity rate
0.0f, // length penalty
nullptr,
stream_);
topk_softmax_workspace_ = reinterpret_cast<float*>(allocator_->reMalloc(
topk_softmax_workspace_,
topk_softmax_workspace_size_ + sizeof(float) * batch_size * beam_width * vocab_size_padded_,
false));
float_log_prob_buf_ = (float*)((char*)topk_softmax_workspace_ + topk_softmax_workspace_size_);
is_allocate_buffer_ = true;
}
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseBeamSearchLayer<T>(max_batch_size,
head_num,
size_per_head,
beam_width,
vocab_size,
vocab_size_padded,
end_id,
diversity_rate,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer):
BaseBeamSearchLayer<T>(beam_search_layer)
{
}
template<typename T>
BeamSearchLayer<T>::~BeamSearchLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template class BeamSearchLayer<float>;
template class BeamSearchLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include <float.h>
namespace fastertransformer {
template<typename T>
class BeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
// meta data
using BaseBeamSearchLayer<T>::vocab_size_;
using BaseBeamSearchLayer<T>::vocab_size_padded_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t beam_width) override;
void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;
using BaseBeamSearchLayer<T>::stream_;
using BaseBeamSearchLayer<T>::is_allocate_buffer_;
using BaseBeamSearchLayer<T>::allocator_;
float* float_log_prob_buf_ = nullptr;
protected:
public:
BeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer);
~BeamSearchLayer();
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_library(BaseBeamSearchLayer STATIC BaseBeamSearchLayer.cu)
set_property(TARGET BaseBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseBeamSearchLayer PUBLIC -lcudart beam_search_penalty_kernels cuda_utils)
add_library(OnlineBeamSearchLayer STATIC OnlineBeamSearchLayer.cu)
set_property(TARGET OnlineBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET OnlineBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(OnlineBeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer online_softmax_beamsearch_kernels)
add_library(BeamSearchLayer STATIC BeamSearchLayer.cu)
set_property(TARGET BeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer beam_search_topk_kernels)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h"
namespace fastertransformer {
static const int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;
static const int MAX_K = 4;
template<typename T>
__global__ void update_kernel(bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses beam_hyps,
const int vocab_size,
const int* end_ids,
const int local_batch_size,
const int beam_width)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
index += blockDim.x * gridDim.x) {
int batch_id = index / beam_width;
sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;
int beam_id = (word_ids[index] / vocab_size) % beam_width;
int word_id = word_ids[index] % vocab_size;
sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
finished[index] = word_id == end_ids[index / beam_width] ? 1 : 0;
parent_ids[index] = beam_id;
word_ids[index] = word_id;
output_ids[index] = word_id;
if (beam_hyps.num_beams != nullptr) {
if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
for (int i = 0; i < beam_width; i++) {
finished[batch_id * beam_width + i] = true;
}
}
}
}
}
void invokeUpdate(bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses* beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size_padded,
const int* end_ids,
cudaStream_t stream)
{
dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
dim3 block(256);
update_kernel<float><<<grid, block, 0, stream>>>(finished,
parent_ids,
sequence_length,
word_ids,
output_ids,
*beam_hyps,
vocab_size_padded,
end_ids,
local_batch_size,
beam_width);
}
template<typename T>
void OnlineBeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size, beam_width]
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 6);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
input_tensors->at("beam_search_diversity_rate").getVal<float>() :
0.0f;
const float length_penalty =
input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;
const int id_offset = step * batch_size * beam_width + local_batch_size * ite * beam_width;
BeamHypotheses beam_hyps;
if (output_tensors->isExist("beam_hyps")) {
beam_hyps = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
beam_hyps.step = step;
beam_hyps.ite = ite;
beam_hyps.local_batch_size = local_batch_size;
beam_hyps.batch_size = output_tensors->at("output_ids").shape[1];
beam_hyps.max_seq_len = output_tensors->at("output_ids").shape[0];
beam_hyps.output_ids_src = output_tensors->at("output_ids").getPtr<int>();
beam_hyps.parent_ids_src = output_tensors->at("parent_ids").getPtr<int>();
beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
beam_hyps.log_probs_src = output_tensors->getPtr<float>("output_log_probs", nullptr);
beam_hyps.length_penalty = length_penalty;
beam_hyps.end_ids = input_tensors->at("end_id").getPtr<int>();
}
invokeTopkSoftMax(input_tensors->at("logits").getPtr<T>(),
(const T*)(nullptr),
output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("cum_log_probs").getPtr<float>(),
output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
topk_softmax_workspace_,
topk_softmax_workspace_size_,
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
diversity_rate,
length_penalty,
stream_);
sync_check_cuda_error();
invokeUpdate(output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// we need to check 2 * beam_width candidates each time
// 64 is the max beam width we support now.
topk_softmax_workspace_size_ =
(size_t)(ceil(batch_size * 64 * (64 * 2) / 4.) * 4 * 2
+ ceil(batch_size * (64 * 2) * SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS * (2 * (MAX_K * 2) + 2) / 4.) * 4);
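// Rough reading of the formula above (see online_softmax_beamsearch_kernels for the
// authoritative layout): the first term presumably holds the per-batch candidate
// buffers of the two-stage top-K reduction (up to 64 beams with 2 * 64 candidates
// each, rounded up to a multiple of 4 and doubled), and the second the per-vocab-part
// partial results (SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS parts with 2 * (MAX_K * 2) + 2
// values each).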
topk_softmax_workspace_ = reinterpret_cast<float*>(
allocator_->reMalloc(topk_softmax_workspace_, sizeof(float) * topk_softmax_workspace_size_, true));
is_allocate_buffer_ = true;
}
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseBeamSearchLayer<T>(max_batch_size,
head_num,
size_per_head,
beam_width,
vocab_size,
vocab_size_padded,
end_id,
diversity_rate,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer):
BaseBeamSearchLayer<T>(beam_search_layer)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T>
OnlineBeamSearchLayer<T>::~OnlineBeamSearchLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template class OnlineBeamSearchLayer<float>;
template class OnlineBeamSearchLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
namespace fastertransformer {
template<typename T>
class OnlineBeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
// meta data
using BaseBeamSearchLayer<T>::vocab_size_;
using BaseBeamSearchLayer<T>::vocab_size_padded_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t beam_width) override;
void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;
using BaseBeamSearchLayer<T>::stream_;
using BaseBeamSearchLayer<T>::is_allocate_buffer_;
using BaseBeamSearchLayer<T>::allocator_;
protected:
public:
OnlineBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer);
~OnlineBeamSearchLayer();
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <algorithm>
namespace fastertransformer {
template<typename T>
void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
curandstate_buf_ = reinterpret_cast<curandState_t*>(
allocator_->reMalloc(curandstate_buf_, sizeof(curandState_t) * batch_size, false));
random_seeds_buf_ = reinterpret_cast<unsigned long long*>(
allocator_->reMalloc(random_seeds_buf_, sizeof(unsigned long long) * batch_size, false));
temperature_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(temperature_buf_, sizeof(float) * batch_size, false));
repetition_penalty_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(repetition_penalty_buf_, sizeof(float) * batch_size, false));
min_lengths_buf_ = reinterpret_cast<int*>(allocator_->reMalloc(min_lengths_buf_, sizeof(int) * batch_size, false));
runtime_logits_buf_ = reinterpret_cast<T*>(
allocator_->reMalloc(runtime_logits_buf_, sizeof(T) * batch_size * vocab_size_padded_, false));
skip_decode_buf_ =
reinterpret_cast<bool*>(allocator_->reMalloc(skip_decode_buf_, sizeof(bool) * batch_size, false));
// host buffers.
temperature_ = new float[batch_size];
repetition_penalty_ = new float[batch_size];
min_lengths_ = new int[batch_size];
skip_decode_ = new bool[batch_size];
is_allocate_buffer_ = true;
}
template<typename T>
void BaseSamplingLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&curandstate_buf_));
allocator_->free((void**)(&random_seeds_buf_));
allocator_->free((void**)(&temperature_buf_));
allocator_->free((void**)(&repetition_penalty_buf_));
allocator_->free((void**)(&min_lengths_buf_));
allocator_->free((void**)(&runtime_logits_buf_));
allocator_->free((void**)(&skip_decode_buf_));
delete[] temperature_;
delete[] repetition_penalty_;
delete[] min_lengths_;
delete[] skip_decode_;
is_allocate_buffer_ = false;
}
}
template<typename T>
BaseSamplingLayer<T>::BaseSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
float top_p,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop),
vocab_size_(vocab_size),
vocab_size_padded_(vocab_size_padded)
{
}
template<typename T>
BaseSamplingLayer<T>::BaseSamplingLayer(BaseSamplingLayer const& sampling_layer):
DynamicDecodeBaseLayer(sampling_layer),
vocab_size_(sampling_layer.vocab_size_),
vocab_size_padded_(sampling_layer.vocab_size_padded_),
sampling_workspace_size_(sampling_layer.sampling_workspace_size_)
{
}
template<typename T>
BaseSamplingLayer<T>::~BaseSamplingLayer()
{
}
template<typename T>
void BaseSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
// Set up the sampling layer for given runtime arguments.
//
// runtime_args:
// runtime_top_k [1] or [batch_size] on cpu, optional.
// runtime_top_p [1] or [batch_size] on cpu, optional
// temperature [1] or [batch_size] on cpu, optional
// repetition_penalty [1] or [batch_size] on cpu, optional
// presence_penalty [1] or [batch_size] on cpu, optional,
// repetition_penalty and presence_penalty are mutually exclusive.
// min_length [1] or [batch_size] on cpu, optional
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ? runtime_args->at("runtime_top_k") : Tensor();
Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
allocateBuffer(batch_size, runtime_top_k, runtime_top_p);
// If the runtime argument provides a single random seed, use it to initialize the curand
// states of all sentences. If it provides [batch_size] random seeds, initialize each
// sentence's state with its own seed. If no random seed is given, initialize all states with seed 0.
if (runtime_args->isExist("random_seed")) {
Tensor random_seeds = runtime_args->at("random_seed");
FT_CHECK_WITH_INFO(random_seeds.shape.size() == 1
&& (random_seeds.size() == 1 || random_seeds.size() == batch_size),
fmtstr("random_seeds must be of shape [1] or [batch_size(%ld)], got random_seeds.shape=%s",
batch_size,
vec2str(random_seeds.shape).c_str()));
if (random_seeds.size() == 1) {
invokeCurandInitialize(curandstate_buf_, batch_size, random_seeds.getVal<unsigned long long>(), stream_);
sync_check_cuda_error();
}
else {
unsigned long long* random_seed_ptr = random_seeds.getPtr<unsigned long long>();
cudaAutoCpy(random_seeds_buf_, random_seed_ptr, batch_size, stream_);
invokeCurandBatchInitialize(curandstate_buf_, batch_size, random_seeds_buf_, stream_);
sync_check_cuda_error();
}
}
else {
// Initialize curand states using the default seed 0.
invokeCurandInitialize(curandstate_buf_, batch_size, 0, stream_);
}
// Setup penalties.
const float default_temperature = 1.0f;
Tensor temperature = runtime_args->isExist("temperature") ?
runtime_args->at("temperature") :
Tensor(MEMORY_CPU, TYPE_FP32, {1}, &default_temperature);
if (temperature.size() == 1) {
float tp = temperature.getVal<float>();
deviceFill(temperature_buf_, batch_size, tp, stream_);
std::fill_n(temperature_, batch_size, tp);
}
else {
cudaAutoCpy(temperature_buf_, temperature.getPtr<float>(), batch_size, stream_);
std::copy_n(temperature.getPtr<float>(), batch_size, temperature_);
}
if (runtime_args->isExist("repetition_penalty") || runtime_args->isExist("presence_penalty")) {
FT_CHECK_WITH_INFO(
!(runtime_args->isExist("repetition_penalty") && runtime_args->isExist("presence_penalty")),
"Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
"Please provide one of repetition_penalty or presence_penalty.");
repetition_penalty_type_ = runtime_args->isExist("repetition_penalty") ? RepetitionPenaltyType::Multiplicative :
RepetitionPenaltyType::Additive;
Tensor repetition_penalty = repetition_penalty_type_ == RepetitionPenaltyType::Multiplicative ?
runtime_args->at("repetition_penalty") :
runtime_args->at("presence_penalty");
if (repetition_penalty.size() == 1) {
float rp = repetition_penalty.getVal<float>();
deviceFill(repetition_penalty_buf_, batch_size, rp, stream_);
std::fill_n(repetition_penalty_, batch_size, rp);
}
else {
cudaAutoCpy(repetition_penalty_buf_, repetition_penalty.getPtr<float>(), batch_size, stream_);
std::copy_n(repetition_penalty.getPtr<float>(), batch_size, repetition_penalty_);
}
}
else {
repetition_penalty_type_ = RepetitionPenaltyType::None;
}
const int default_min_length = 0;
Tensor min_lengths = runtime_args->at("min_length", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &default_min_length));
if (min_lengths.size() == 1) {
int minlen = min_lengths.getVal<int>();
deviceFill(min_lengths_buf_, batch_size, minlen, stream_);
std::fill_n(min_lengths_, batch_size, minlen);
}
else {
cudaAutoCpy(min_lengths_buf_, min_lengths.getPtr<int>(), batch_size, stream_);
std::copy_n(min_lengths.getPtr<int>(), batch_size, min_lengths_);
}
}
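// Illustrative caller-side sketch (not part of the original source): building the
// runtime_args TensorMap documented above. The scalar values are arbitrary examples
// and every key is optional; per-sample control is possible by passing [batch_size]
// shaped CPU tensors instead of [1].
template<typename T>
static void setupSamplingExample(BaseSamplingLayer<T>* layer, size_t batch_size)
{
    unsigned int       runtime_top_k      = 4;
    float              runtime_top_p      = 0.9f;
    float              temperature        = 0.7f;
    float              repetition_penalty = 1.1f;
    unsigned long long random_seed        = 42ULL;
    TensorMap runtime_args(std::unordered_map<std::string, Tensor>{
        {"runtime_top_k", Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &runtime_top_k)},
        {"runtime_top_p", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &runtime_top_p)},
        {"temperature", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &temperature)},
        {"repetition_penalty", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty)},
        {"random_seed", Tensor(MEMORY_CPU, TYPE_UINT64, {1}, &random_seed)}});
    layer->setup(batch_size, /* beam_width = */ 1, &runtime_args);
}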
template<typename T>
void BaseSamplingLayer<T>::forward(std::vector<Tensor>* output_tensors, const std::vector<Tensor>* input_tensors)
{
// input_tensors:
// logits [local_batch_size, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// max_input_length [1] on cpu
// input_lengths [local_batch_size]
// ite [1] on cpu
// random_seed [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size]
// finished [local_batch_size]
// sequence_length [local_batch_size]
// cum_log_probs [local_batch_size], must be float*
FT_CHECK(false); // TODO deprecated, need to remove
std::unordered_map<std::string, Tensor> input_tensors_map{{"logits", input_tensors->at(0)},
{"embedding_bias", input_tensors->at(1)},
{"step", input_tensors->at(2)},
{"max_input_length", input_tensors->at(3)},
{"input_lengths", input_tensors->at(4)},
{"ite", input_tensors->at(5)}};
if (input_tensors->size() == 7) {
input_tensors_map.insert({"random_seed", input_tensors->at(6)});
}
std::unordered_map<std::string, Tensor> output_tensors_map{{"output_ids", output_tensors->at(0)},
{"finished", output_tensors->at(1)},
{"sequence_length", output_tensors->at(2)},
{"cum_log_probs", output_tensors->at(3)}};
forward(&output_tensors_map, &input_tensors_map);
}
template<typename T>
void BaseSamplingLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
}
template<typename T>
void BaseSamplingLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, vocab_size_padded]
// embedding_bias [vocab_size_padded], optional
// step [1] on cpu
// max_input_length [1] on cpu
// input_lengths [local_batch_size], optional
// ite [1] on cpu
// end_id [local_batch_size], optional
// output_tensors:
// output_ids [max_seq_len, batch_size]
// finished [local_batch_size], optional
// sequence_length [local_batch_size], optional
// cum_log_probs [batch_size], must be float*, optional
// The cumulative log probability of generated tokens.
// output_log_probs [local_batch_size], must be float*, optional
// The log probs at the current step.
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int max_input_length = input_tensors->at("max_input_length").getVal<int>();
T* logits = input_tensors->at("logits").getPtr<T>();
#define ALL_OF(p_, sz_, dt_, v_) (std::all_of(p_, p_ + sz_, [&](dt_ b) { return b == v_; }))
bool* skip_decode = skip_decode_ + ite * local_batch_size;
if (ALL_OF(skip_decode, local_batch_size, bool, true)) {
// No sample in the current batch requires TopX sampling.
return;
}
skip_any_ = std::any_of(skip_decode, skip_decode + local_batch_size, [](bool b) { return b; });
if (skip_any_) {
// A TopX sampling layer modifies the logit values in place. When skip_any == true,
// the topk and topp layers both run for the same batch in the same step, so we copy
// the logits to an internal buffer to avoid affecting the other sampling layer.
FT_CHECK(input_tensors->at("logits").size() == local_batch_size * vocab_size_padded_);
cudaD2Dcpy(runtime_logits_buf_, logits, input_tensors->at("logits").size());
logits = runtime_logits_buf_;
}
const T* embedding_bias =
input_tensors->isExist("embedding_bias") ? input_tensors->at("embedding_bias").getPtr<T>() : nullptr;
if (embedding_bias != nullptr || !ALL_OF(temperature_ + ite * local_batch_size, local_batch_size, float, 1.0f)) {
invokeBatchApplyTemperaturePenalty(logits,
embedding_bias,
temperature_buf_ + ite * local_batch_size,
local_batch_size,
vocab_size_,
vocab_size_padded_,
stream_);
}
sync_check_cuda_error();
if (step > 1 && repetition_penalty_type_ != RepetitionPenaltyType::None) {
float default_value = getDefaultPenaltyValue(repetition_penalty_type_);
if (!ALL_OF(repetition_penalty_ + ite * local_batch_size, local_batch_size, float, default_value)) {
invokeBatchApplyRepetitionPenalty(
logits,
repetition_penalty_buf_ + ite * local_batch_size,
output_tensors->at("output_ids").getPtrWithOffset<int>(ite * local_batch_size),
batch_size,
local_batch_size,
vocab_size_padded_,
input_tensors->at("input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {}, nullptr}).getPtr<int>(),
max_input_length,
step,
repetition_penalty_type_,
stream_);
sync_check_cuda_error();
}
}
const int num_generated_tokens = step - max_input_length;
const int* min_lengths = min_lengths_ + ite * local_batch_size;
const bool invoke_min_length_penalty = std::any_of(
min_lengths, min_lengths + local_batch_size, [&](int min_length) { return min_length > num_generated_tokens; });
if (invoke_min_length_penalty) {
FT_CHECK_WITH_INFO(input_tensors->isExist("end_id"), "Need end_id to apply min length penalty");
invokeMinLengthPenalty(logits,
min_lengths_buf_ + ite * local_batch_size,
input_tensors->getPtr<const int>("end_id"),
output_tensors->getPtr<const int>("sequence_length"),
max_input_length,
local_batch_size,
vocab_size_padded_,
stream_);
sync_check_cuda_error();
}
#undef ALL_OF
runSampling(output_tensors, input_tensors);
if (is_free_buffer_after_forward_) {
freeBuffer();
}
sync_check_cuda_error();
FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
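// Rough host-side sketch of the min-length penalty applied above. Assumption: the
// kernel masks the end-of-sequence logit until min_length new tokens have been
// generated; the authoritative implementation is invokeMinLengthPenalty in
// sampling_penalty_kernels. Illustration only, not part of the original source.
static void minLengthPenaltyReference(float*     logits,            // [local_batch_size, vocab_size_padded]
                                      const int* min_lengths,       // [local_batch_size]
                                      const int* end_ids,           // [local_batch_size]
                                      const int* sequence_lengths,  // [local_batch_size]
                                      int        max_input_length,
                                      int        local_batch_size,
                                      size_t     vocab_size_padded)
{
    for (int b = 0; b < local_batch_size; b++) {
        const int num_generated = sequence_lengths[b] - max_input_length;
        if (num_generated < min_lengths[b]) {
            // effectively forbid sampling the end token for this sequence
            logits[b * vocab_size_padded + end_ids[b]] = -1e20f;
        }
    }
}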
template class BaseSamplingLayer<float>;
template class BaseSamplingLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <curand_kernel.h>
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
namespace fastertransformer {
template<typename T>
class BaseSamplingLayer: public DynamicDecodeBaseLayer {
private:
bool isValidBatchSize(size_t batch_size);
protected:
size_t vocab_size_;
size_t vocab_size_padded_;
size_t sampling_workspace_size_;
void* sampling_workspace_ = nullptr;
curandState_t* curandstate_buf_ = nullptr;
unsigned long long* random_seeds_buf_ = nullptr;
float* temperature_buf_ = nullptr;
float* repetition_penalty_buf_ = nullptr;
int* min_lengths_buf_ = nullptr;
bool* skip_decode_buf_ = nullptr;
T* runtime_logits_buf_ = nullptr;
float* temperature_ = nullptr;
float* repetition_penalty_ = nullptr;
int* min_lengths_ = nullptr;
bool* skip_decode_ = nullptr;
bool skip_any_ = false;
RepetitionPenaltyType repetition_penalty_type_ = RepetitionPenaltyType::None;
virtual void runSampling(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
virtual void freeBuffer();
virtual void allocateBuffer() = 0;
virtual void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p);
public:
curandState_t* curandstate_buf()
{
return curandstate_buf_;
}
BaseSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
float top_p,
unsigned long long random_seed, // TODO(bhsueh) delete
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
BaseSamplingLayer(BaseSamplingLayer const& sampling_layer);
~BaseSamplingLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) override;
void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) override;
void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC -lcudart sampling_penalty_kernels memory_utils)
add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels)
add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <float.h>
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<uint TOP_K_MAX>
__global__ void setup_topk_runtime_args(int batch_size,
uint top_k,
uint* top_ks,
int top_ks_size,
float top_p,
float* top_ps,
int top_ps_size,
bool* skip_decode)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = index; i < batch_size; i += gridDim.x * blockDim.x) {
uint k = top_ks_size > 1 ? top_ks[i] : top_k;
float p = top_ps_size > 1 ? top_ps[i] : top_p;
if (k == 0 && p == 0.0f) {
            // FT's top-p implementation does not support top_p = 0.0f, but that setting is
            // equivalent to greedy search, so we fall back to top_k = 1 instead.
k = 1;
}
if (k > 0 && p == 0.0f) {
            // For compatibility with FT <= 5.0.
            // This case corresponds to the old top-k sampling, which is equivalent to the
            // old topk_topp sampling with top_p = 1.0f. TopKSamplingLayer and
            // TopKTopPSamplingLayer have been merged into TopKSamplingLayer, so the case
            // top_k > 0 and top_p = 0.0f is mapped to top_k > 0 and top_p = 1.0f.
p = 1.0f;
}
        // Clip the k value: the top-k sampling kernel supports at most TOP_K_MAX candidates.
top_ks[i] = k > TOP_K_MAX ? TOP_K_MAX : k;
if (k > TOP_K_MAX) {
printf("[WARNING] topk (%d) is larger than max supported number (%d) for token %d"
" clip to max supported number %d. \n",
k,
TOP_K_MAX,
i,
top_ks[i]);
}
// Clip p value if it is out of range. range = [0.0, 1.0].
top_ps[i] = p < 0.0f ? 0.0f : (p > 1.0f ? 1.0f : p);
if (p < 0.0f || p > 1.0f) {
printf("[WARNING] topp (%f) is out of range ([0.0, 1.0f]) for token %d"
" clip to closest number %f.\n",
p,
i,
top_ps[i]);
}
skip_decode[i] = k == 0;
}
}
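// Summary of the normalization performed by setup_topk_runtime_args (a descriptive note, not
// part of the original file): per sequence, the requested (top_k, top_p) pair is mapped to an
// effective pair before decoding.
//
//     requested (k, p)      effective (k, p)         handled by
//     (0, 0)                (1, 1.0)                 TopKSamplingLayer (greedy)
//     (k > 0, 0)            (k, 1.0)                 TopKSamplingLayer
//     (k > 0, p > 0)        (min(k, TOP_K_MAX), p)   TopKSamplingLayer
//     (0, p > 0)            unchanged                TopPSamplingLayer (skip_decode is true here)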
template<typename T>
void TopKSamplingLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void TopKSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
uint max_top_k = top_k.size() > 0 ? top_k.max<uint>() : 1;
if (max_top_k == 0) {
        // For safety: TopKSamplingLayer treats the case top_k = 0 and top_p = 0 as greedy
        // decoding (i.e. top_k = 1), even though max_top_k is 0 in that case.
max_top_k = 1;
}
invokeTopKSampling<T>(nullptr,
sampling_workspace_size_,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
max_top_k,
1.0f,
vocab_size_padded_,
nullptr,
stream_,
batch_size,
skip_decode_buf_);
sampling_workspace_ = allocator_->reMalloc(sampling_workspace_, sampling_workspace_size_, false);
runtime_top_k_buf_ =
reinterpret_cast<uint*>(allocator_->reMalloc(runtime_top_k_buf_, sizeof(uint) * batch_size, false));
runtime_top_p_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(runtime_top_p_buf_, sizeof(float) * batch_size, false));
is_allocate_buffer_ = true;
}
template<typename T>
void TopKSamplingLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&sampling_workspace_));
allocator_->free((void**)(&runtime_top_k_buf_));
allocator_->free((void**)(&runtime_top_p_buf_));
}
BaseSamplingLayer<T>::freeBuffer();
is_allocate_buffer_ = false;
}
template<typename T>
void TopKSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
    // Set up the runtime top-k and top-p arguments
    // (see the illustrative sketch after this function).
    //
    // runtime_args:
    //     runtime_top_k [1] or [batch_size] on cpu, optional, uint.
    //     runtime_top_p [1] or [batch_size] on cpu, optional, float.
    //     temperature [1] or [batch_size] on cpu, optional, float.
    //     repetition_penalty [1] or [batch_size] on cpu, optional, float.
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);
uint tmp_top_k = 0;
const Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ?
runtime_args->at("runtime_top_k") :
Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &tmp_top_k);
const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
const size_t runtime_top_k_size = runtime_top_k.size();
const size_t runtime_top_p_size = runtime_top_p.size();
uint top_k = runtime_top_k.max<uint>();
float top_p = runtime_top_p_size == 0 ? 0.0f : runtime_top_p.getVal<float>();
if (runtime_top_k_size > 1) {
FT_CHECK_WITH_INFO(
runtime_top_k.size() == batch_size,
fmtstr("runtime_top_k.size() (%d) == batch_size (%d) is not satisfied!", runtime_top_k.size(), batch_size));
cudaAutoCpy(runtime_top_k_buf_, runtime_top_k.getPtr<uint>(), batch_size, stream_);
}
if (runtime_top_p_size > 1) {
FT_CHECK_WITH_INFO(
runtime_top_p.size() == batch_size,
fmtstr("runtime_top_p.size() (%d) == batch_size (%d) is not satisfied!", runtime_top_p.size(), batch_size));
cudaAutoCpy(runtime_top_p_buf_, runtime_top_p.getPtr<float>(), batch_size, stream_);
}
dim3 block(std::min((int)batch_size, 256));
dim3 grid(div_up((int)batch_size, (int)block.x));
// support top_k up to 1024.
setup_topk_runtime_args<1024><<<grid, block, 0, stream_>>>(batch_size,
top_k,
runtime_top_k_buf_,
runtime_top_k_size,
top_p,
runtime_top_p_buf_,
runtime_top_p_size,
skip_decode_buf_);
cudaAutoCpy(skip_decode_, skip_decode_buf_, batch_size, stream_);
uint* runtime_top_ks = new uint[batch_size];
cudaAutoCpy(runtime_top_ks, runtime_top_k_buf_, batch_size, stream_);
runtime_max_top_k_ = static_cast<int>(*std::max_element(runtime_top_ks, runtime_top_ks + batch_size));
delete[] runtime_top_ks;
}
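// Illustrative sketch (not part of the original sources): how a caller might populate
// `runtime_args` before calling setup(). It assumes TensorMap::insert(name, Tensor), as used
// elsewhere in FasterTransformer, and that the top_k/top_p values live on the CPU; `layer` and
// `batch_size` are hypothetical.
//
//     uint  top_k = 4;     // broadcast to the whole batch because the shape is {1}
//     float top_p = 0.9f;  // combined top-k / top-p sampling
//     TensorMap runtime_args;
//     runtime_args.insert("runtime_top_k", Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &top_k));
//     runtime_args.insert("runtime_top_p", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &top_p));
//     layer.setup(batch_size, 1 /* beam_width */, &runtime_args);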
template<typename T>
void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, vocab_size_padded]
// embedding_bias [vocab_size_padded], optional
// step [1] on cpu
// max_input_length [1] on cpu
// input_lengths [local_batch_size], optional
// ite [1] on cpu
// output_tensors:
// output_ids [max_seq_len, batch_size]
// finished [local_batch_size], optional
// sequence_length [local_batch_size], optional
    //      cum_log_probs [batch_size], must be float*, optional
    //          The cumulative log probability of the generated tokens.
    //      output_log_probs [local_batch_size], must be float*, optional
    //          The log probabilities at the current step.
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int ite = input_tensors->at("ite").getVal<int>();
const int step = input_tensors->at("step").getVal<int>();
    // If any sequence in the batch is skipped, the logits have already been copied to and
    // processed in runtime_logits_buf_.
T* logits = !skip_any_ ? input_tensors->at("logits").getPtr<T>() : runtime_logits_buf_;
invokeAddBiasEndMask(logits,
(T*)(nullptr),
input_tensors->at("end_id").getPtr<const int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
local_batch_size,
vocab_size_,
vocab_size_padded_,
stream_);
sync_check_cuda_error();
float* cum_log_probs =
output_tensors->isExist("cum_log_probs") ? output_tensors->at("cum_log_probs").getPtr<float>() : nullptr;
float* output_log_probs =
output_tensors->isExist("output_log_probs") ? output_tensors->at("output_log_probs").getPtr<float>() : nullptr;
if (cum_log_probs != nullptr || output_log_probs != nullptr) {
invokeAddBiasSoftMax(
logits,
(T*)(nullptr),
input_tensors->at("end_id").getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
local_batch_size,
vocab_size_padded_,
vocab_size_,
stream_);
sync_check_cuda_error();
}
invokeBatchTopKSampling(
sampling_workspace_,
sampling_workspace_size_,
logits,
output_tensors->at("output_ids").getPtrWithOffset<int>(step * batch_size + ite * local_batch_size),
output_tensors->at("sequence_length", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
cum_log_probs,
output_log_probs,
curandstate_buf_ + ite * local_batch_size,
        (int)runtime_max_top_k_,  // unused because runtime_top_k_buf_ is never nullptr; kept for the legacy API.
        (int*)(runtime_top_k_buf_ + ite * local_batch_size),
        1.0f,  // unused because runtime_top_p_buf_ is never nullptr; kept for the legacy API.
        runtime_top_p_buf_ + ite * local_batch_size,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
stream_,
local_batch_size,
skip_decode_buf_ + ite * local_batch_size);
sync_check_cuda_error();
}
template<typename T>
TopKSamplingLayer<T>::TopKSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseSamplingLayer<T>(max_batch_size,
vocab_size,
vocab_size_padded,
end_id,
top_k,
0.0f,
random_seed,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
nullptr)
{
}
template<typename T>
TopKSamplingLayer<T>::TopKSamplingLayer(TopKSamplingLayer<T> const& top_k_sampling_layer):
BaseSamplingLayer<T>(top_k_sampling_layer)
{
}
template<typename T>
TopKSamplingLayer<T>::~TopKSamplingLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
freeBuffer();
}
template class TopKSamplingLayer<float>;
template class TopKSamplingLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<typename T>
class TopKSamplingLayer: public BaseSamplingLayer<T> {
private:
void runSampling(TensorMap* output_tensors, TensorMap* input_tensors) override;
void freeBuffer() override;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p) override;
uint runtime_max_top_k_ = 1;
uint* runtime_top_k_buf_ = nullptr;
float* runtime_top_p_buf_ = nullptr;
using BaseSamplingLayer<T>::vocab_size_;
using BaseSamplingLayer<T>::vocab_size_padded_;
using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
using BaseSamplingLayer<T>::runtime_logits_buf_;
using BaseSamplingLayer<T>::stream_;
using BaseSamplingLayer<T>::allocator_;
using BaseSamplingLayer<T>::is_allocate_buffer_;
protected:
public:
TopKSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
TopKSamplingLayer(TopKSamplingLayer<T> const& top_k_sampling_layer);
~TopKSamplingLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <float.h>
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
static __global__ void set_topp_runtime_args(int batch_size,
uint top_k,
uint* top_ks,
int top_ks_size,
float top_p,
float* top_ps,
int top_ps_size,
bool* skip_decode,
float* initial_top_p_buf,
float* top_p_decay_buf,
const float* top_p_decay,
float* top_p_min_buf,
const float* top_p_min,
int32_t* top_p_reset_ids_buf,
const uint32_t* top_p_reset_ids)
{
    /**
     * @brief Set up the runtime arguments for top-p sampling: broadcast top_p to top_ps and
     *        top_k to top_ks, and copy top_p_decay/top_p_min/top_p_reset_ids into the
     *        internal buffers.
     *
     * \param batch_size          number of sequences processed by this layer
     * \param top_k               default top_k, used when top_ks_size == 1
     * \param top_ks              [batch_size], per-sequence top_k values
     * \param top_ks_size         1 or batch_size
     * \param top_p               default top_p, used when top_ps_size == 1
     * \param top_ps              [batch_size], per-sequence top_p values
     * \param top_ps_size         1 or batch_size
     * \param skip_decode         [batch_size], set to true for sequences handled by the top-k layer
     * \param initial_top_p_buf   [batch_size], initial top_p values, used when the decay is reset
     * \param top_p_decay_buf     [batch_size], internal copy of top_p_decay
     * \param top_p_decay         [batch_size], optional, must be in (0.0, 1.0]
     * \param top_p_min_buf       [batch_size], internal copy of top_p_min
     * \param top_p_min           [batch_size], optional, must be in (0.0, 1.0]
     * \param top_p_reset_ids_buf [batch_size], internal copy of top_p_reset_ids
     * \param top_p_reset_ids     [batch_size], optional, token ids that reset top_p to its initial value
     */
int index = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = index; i < batch_size; i += gridDim.x * blockDim.x) {
uint k = top_ks_size > 1 ? top_ks[i] : top_k;
float p = top_ps_size > 1 ? top_ps[i] : top_p;
if (k == 0 && p == 0.0f) {
            // FT's top-p implementation does not support top_p = 0.0f, but that setting is
            // equivalent to greedy search, so we fall back to top_k = 1 instead.
k = 1;
}
top_ks[i] = k;
// Clip p value if it is out of range. range = [0.0, 1.0].
top_ps[i] = p < 0.0f ? 0.0f : (p > 1.0f ? 1.0f : p);
if (p < 0.0f || p > 1.0f) {
printf("[WARNING] topp (%f) is out of range ([0.0, 1.0f]) for token %d"
" clip to closest number %f.\n",
p,
i,
top_ps[i]);
}
skip_decode[i] = k > 0;
initial_top_p_buf[i] = top_ps[i];
top_p_decay_buf[i] = top_p_decay == nullptr ? 1.0f : top_p_decay[i];
if (top_p_decay_buf[i] > 1.0f || top_p_decay_buf[i] <= 0.0f) {
printf("[WARNING] top_p_decay_buf (%f) is out of range ([0.0, 1.0f]) for token %d,"
" change to 1.0f.\n",
top_p_decay_buf[i],
i);
top_p_decay_buf[i] = 1.0f;
}
        top_p_min_buf[i] = top_p_min == nullptr ? 1e-6f : top_p_min[i];  // prevent top_p from decaying to 0.0
        if (top_p_min_buf[i] > 1.0f || top_p_min_buf[i] <= 0.0f) {
            printf("[WARNING] top_p_min_buf (%f) is out of range (0.0, 1.0] for token %d;"
                   " resetting to 0.5f.\n",
top_p_min_buf[i],
i);
top_p_min_buf[i] = 0.5f;
}
top_p_reset_ids_buf[i] = (int32_t)(top_p_reset_ids == nullptr ? -1 : top_p_reset_ids[i]);
}
}
template<typename T>
void TopPSamplingLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
invokeTopPSampling<T>(nullptr, // workspace
sampling_workspace_size_,
cub_temp_storage_size_,
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
nullptr, // log_probs
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_,
batch_size,
vocab_size_padded_,
nullptr,
top_p.size() > 0 ? top_p.max<float>() : 0.0f,
stream_,
cuda_device_prop_,
skip_decode_buf_);
sampling_workspace_ = allocator_->reMalloc(sampling_workspace_, sampling_workspace_size_, true);
runtime_top_k_buf_ =
reinterpret_cast<uint*>(allocator_->reMalloc(runtime_top_k_buf_, sizeof(uint) * batch_size, false));
runtime_top_p_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(runtime_top_p_buf_, sizeof(float) * batch_size, false));
initial_top_p_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(initial_top_p_buf_, sizeof(float) * batch_size, false));
top_p_decay_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(top_p_decay_buf_, sizeof(float) * batch_size, false));
top_p_min_buf_ = reinterpret_cast<float*>(allocator_->reMalloc(top_p_min_buf_, sizeof(float) * batch_size, false));
top_p_reset_ids_buf_ =
reinterpret_cast<int32_t*>(allocator_->reMalloc(top_p_reset_ids_buf_, sizeof(int32_t) * batch_size, false));
topp_id_vals_buf_ = reinterpret_cast<int*>(
allocator_->reMalloc(topp_id_vals_buf_, sizeof(int) * batch_size * vocab_size_padded_, false));
topp_offset_buf_ =
reinterpret_cast<int*>(allocator_->reMalloc(topp_offset_buf_, sizeof(int) * (batch_size + 1), false));
begin_topp_offset_buf_ =
reinterpret_cast<int*>(allocator_->reMalloc(begin_topp_offset_buf_, sizeof(int) * (batch_size + 1), false));
is_allocate_buffer_ = true;
}
template<typename T>
void TopPSamplingLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&sampling_workspace_));
allocator_->free((void**)(&topp_id_vals_buf_));
allocator_->free((void**)(&topp_offset_buf_));
allocator_->free((void**)(&begin_topp_offset_buf_));
allocator_->free((void**)(&runtime_top_k_buf_));
allocator_->free((void**)(&runtime_top_p_buf_));
allocator_->free((void**)(&initial_top_p_buf_));
allocator_->free((void**)(&top_p_decay_buf_));
allocator_->free((void**)(&top_p_min_buf_));
allocator_->free((void**)(&top_p_reset_ids_buf_));
}
BaseSamplingLayer<T>::freeBuffer();
is_allocate_buffer_ = false;
}
template<typename T>
void TopPSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
    /**
     * @brief Set up the sampling layer for the given runtime arguments
     *        (see the illustrative sketch after this function).
     *
     * runtime_args:
     * \param runtime_top_k [1] or [batch_size] on cpu, optional, uint
     * \param runtime_top_p [1] or [batch_size] on cpu, optional, float
     * \param temperature [1] or [batch_size] on cpu, optional, float
     * \param repetition_penalty [1] or [batch_size] on cpu, optional, float
     * \param top_p_decay [batch_size] on gpu, float, optional
     * \param top_p_min [batch_size] on gpu, float, optional
     * \param top_p_reset_ids [batch_size] on gpu, uint32, optional
     */
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);
const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
const size_t runtime_top_p_size = runtime_top_p.size();
if (runtime_top_p_size == 0) {
std::fill_n(skip_decode_, batch_size, true);
return;
}
uint tmp_top_k = 0;
const Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ?
runtime_args->at("runtime_top_k") :
Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &tmp_top_k);
const size_t runtime_top_k_size = runtime_top_k.size();
uint top_k = runtime_top_k.getVal<uint>();
float top_p = runtime_top_p.getVal<float>();
if (runtime_top_k_size > 1) {
FT_CHECK(runtime_top_k.size() == batch_size);
cudaH2Dcpy(runtime_top_k_buf_, runtime_top_k.getPtr<uint>(), batch_size);
}
if (runtime_top_p_size > 1) {
FT_CHECK(runtime_top_p.size() == batch_size);
cudaH2Dcpy(runtime_top_p_buf_, runtime_top_p.getPtr<float>(), batch_size);
}
dim3 block(std::min((int)batch_size, 256));
dim3 grid(div_up((int)batch_size, (int)block.x));
const float* top_p_decay = runtime_args->getPtr<float>("top_p_decay", nullptr);
const float* top_p_min = runtime_args->getPtr<float>("top_p_min", nullptr);
const uint32_t* top_p_reset_ids = runtime_args->getPtr<uint32_t>("top_p_reset_ids", nullptr);
set_topp_runtime_args<<<grid, block, 0, stream_>>>(batch_size,
top_k,
runtime_top_k_buf_,
runtime_top_k_size,
top_p,
runtime_top_p_buf_,
runtime_top_p_size,
skip_decode_buf_,
initial_top_p_buf_,
top_p_decay_buf_,
top_p_decay,
top_p_min_buf_,
top_p_min,
top_p_reset_ids_buf_,
top_p_reset_ids);
sync_check_cuda_error();
cudaAutoCpy(skip_decode_, skip_decode_buf_, batch_size, stream_);
float* runtime_top_ps = new float[batch_size];
cudaAutoCpy(runtime_top_ps, runtime_top_p_buf_, batch_size, stream_);
runtime_max_top_p_ = *std::max_element(runtime_top_ps, runtime_top_ps + batch_size);
delete[] runtime_top_ps;
}
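// Illustrative sketch (not part of the original sources): how a caller might enable pure top-p
// sampling together with the optional decay parameters. It assumes TensorMap::insert(name, Tensor);
// `layer`, `batch_size`, `d_decay`, `d_min` and `d_reset_ids` are hypothetical, the latter three
// being device buffers of length batch_size.
//
//     float top_p = 0.8f;
//     TensorMap runtime_args;
//     runtime_args.insert("runtime_top_p", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &top_p));
//     runtime_args.insert("top_p_decay", Tensor(MEMORY_GPU, TYPE_FP32, {batch_size}, d_decay));
//     runtime_args.insert("top_p_min", Tensor(MEMORY_GPU, TYPE_FP32, {batch_size}, d_min));
//     runtime_args.insert("top_p_reset_ids", Tensor(MEMORY_GPU, TYPE_UINT32, {batch_size}, d_reset_ids));
//     layer.setup(batch_size, 1 /* beam_width */, &runtime_args);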
template<typename T>
void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* input_tensors)
{
    /**
     * input_tensors:
     * \param logits [local_batch_size, vocab_size_padded]
     * \param embedding_bias [vocab_size_padded], optional
     * \param step [1] on cpu
     * \param max_input_length [1] on cpu
     * \param input_lengths [local_batch_size], optional
     * \param ite [1] on cpu
     *
     * output_tensors:
     * \param output_ids [max_seq_len, batch_size]
     * \param finished [local_batch_size], optional
     * \param sequence_length [local_batch_size], optional
     * \param cum_log_probs [batch_size], must be float*, optional
     *        The cumulative log probability of the generated tokens.
     * \param output_log_probs [local_batch_size], must be float*, optional
     *        The log probabilities at the current step.
     */
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
    // If any sequence in the batch is skipped, the logits have already been copied to and
    // processed in runtime_logits_buf_.
T* logits = !skip_any_ ? input_tensors->at("logits").getPtr<T>() : runtime_logits_buf_;
invokeTopPInitialize(
topp_id_vals_buf_, topp_offset_buf_, begin_topp_offset_buf_, local_batch_size, vocab_size_padded_, stream_);
sync_check_cuda_error();
invokeAddBiasSoftMax(logits,
(T*)(nullptr),
input_tensors->at("end_id").getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
local_batch_size,
vocab_size_padded_,
vocab_size_,
stream_);
sync_check_cuda_error();
float* cum_log_probs =
output_tensors->isExist("cum_log_probs") ? output_tensors->at("cum_log_probs").getPtr<float>() : nullptr;
float* output_log_probs =
output_tensors->isExist("output_log_probs") ? output_tensors->at("output_log_probs").getPtr<float>() : nullptr;
invokeBatchTopPSampling<T>(
sampling_workspace_,
sampling_workspace_size_,
cub_temp_storage_size_,
output_tensors->at("output_ids").getPtrWithOffset<int>(step * batch_size + ite * local_batch_size),
output_tensors->at("sequence_length", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
cum_log_probs,
output_log_probs,
logits,
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_ + ite * local_batch_size,
local_batch_size,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
runtime_max_top_p_,
runtime_top_p_buf_ + ite * local_batch_size,
stream_,
cuda_device_prop_,
skip_decode_buf_ + ite * local_batch_size);
sync_check_cuda_error();
invokeComputeToppDecay(
runtime_top_p_buf_ + ite * local_batch_size,
initial_top_p_buf_ + ite * local_batch_size,
output_tensors->getPtrWithOffset<int>("output_ids", step * batch_size + ite * local_batch_size),
top_p_decay_buf_ + ite * local_batch_size,
top_p_min_buf_ + ite * local_batch_size,
top_p_reset_ids_buf_ + ite * local_batch_size,
local_batch_size,
stream_);
sync_check_cuda_error();
FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
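// Note on the decay step above (a descriptive note, not part of the original file): after each
// sampling step, invokeComputeToppDecay appears to update the per-sequence top_p roughly as
//
//     top_p = max(top_p * top_p_decay, top_p_min);
//     if (generated_token == top_p_reset_ids) top_p = initial_top_p;
//
// which lets top_p shrink over consecutive steps and snap back to its initial value whenever a
// designated reset token is produced.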
template<typename T>
TopPSamplingLayer<T>::TopPSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float top_p,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
BaseSamplingLayer<T>(max_batch_size,
vocab_size,
vocab_size_padded,
end_id,
0,
top_p,
random_seed,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
cuda_device_prop)
{
}
template<typename T>
TopPSamplingLayer<T>::TopPSamplingLayer(TopPSamplingLayer<T> const& top_p_sampling_layer):
BaseSamplingLayer<T>(top_p_sampling_layer)
{
}
template<typename T>
TopPSamplingLayer<T>::~TopPSamplingLayer()
{
freeBuffer();
}
template class TopPSamplingLayer<float>;
template class TopPSamplingLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
namespace fastertransformer {
template<typename T>
class TopPSamplingLayer: public BaseSamplingLayer<T> {
private:
void runSampling(TensorMap* output_tensors, TensorMap* input_tensors) override;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p) override;
void freeBuffer() override;
uint* runtime_top_k_buf_ = nullptr;
float* runtime_top_p_buf_ = nullptr;
float runtime_max_top_p_;
float* initial_top_p_buf_ = nullptr;
float* top_p_decay_buf_ = nullptr;
float* top_p_min_buf_ = nullptr;
int32_t* top_p_reset_ids_buf_ = nullptr;
int* topp_id_vals_buf_ = nullptr;
int* topp_offset_buf_ = nullptr;
int* begin_topp_offset_buf_ = nullptr;
size_t cub_temp_storage_size_;
using BaseSamplingLayer<T>::vocab_size_;
using BaseSamplingLayer<T>::vocab_size_padded_;
using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
using BaseSamplingLayer<T>::runtime_logits_buf_;
using BaseSamplingLayer<T>::stream_;
using BaseSamplingLayer<T>::allocator_;
using BaseSamplingLayer<T>::is_allocate_buffer_;
using BaseSamplingLayer<T>::cuda_device_prop_;
protected:
public:
TopPSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float top_p,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
TopPSamplingLayer(TopPSamplingLayer<T> const& top_p_sampling_layer);
~TopPSamplingLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <vector>
#pragma once
namespace fastertransformer {
template<typename T>
struct FtWeight {
public:
std::string name_;
std::vector<size_t> shape_;
size_t size_ = 0;
T* ptr_ = nullptr;
FtWeight() {}
FtWeight(const std::string name, const std::vector<size_t> shape, T* ptr): name_(name), shape_(shape), ptr_(ptr)
{
size_ = 1;
for (uint i = 0; i < shape_.size(); i++) {
size_ *= shape_[i];
}
}
~FtWeight()
{
size_ = 0;
ptr_ = nullptr;
}
};
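// Illustrative usage (not part of the original file): wrap an existing buffer together with its
// shape; size_ is derived from the product of the shape dimensions.
//
//     std::vector<float> data(16);
//     FtWeight<float> w("ffn.bias", {4, 4}, data.data());  // w.size_ == 16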
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_subdirectory(llama)
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/logger.h"
#include <pthread.h>
namespace fastertransformer {
class Barrier {
public:
Barrier(unsigned count)
{
FT_LOG_INFO("Barrier(%d)", (int)count);
pthread_barrier_init(&barrier_, nullptr, count);
}
Barrier(const Barrier&) = delete;
Barrier& operator=(const Barrier&) = delete;
Barrier(Barrier&&) noexcept = delete;
Barrier& operator=(Barrier&&) noexcept = delete;
void wait()
{
pthread_barrier_wait(&barrier_);
}
~Barrier()
{
pthread_barrier_destroy(&barrier_);
}
private:
pthread_barrier_t barrier_{};
};
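// Illustrative usage (not part of the original file): release two threads once both have reached
// the barrier, e.g. together with std::thread.
//
//     Barrier barrier(2);
//     std::thread worker([&] { /* ... */ barrier.wait(); });
//     barrier.wait();  // returns only after the worker has also called wait()
//     worker.join();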
} // namespace fastertransformer
# Copyright (c) OpenMMLab. All rights reserved.
cmake_minimum_required(VERSION 3.8)
add_subdirectory(fused_multi_head_attention)
add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
LlamaCacheManager.cc
LlamaContextDecoder.cc
LlamaContextAttentionLayer.cc
LlamaDecoderSelfAttentionLayer.cc
LlamaDecoder.cc
LlamaWeight.cc
LlamaDecoderLayerWeight.cc
LlamaFfnLayer.cc
llama_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu)
set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(Llama PUBLIC -lcudart
cublasMMWrapper
DynamicDecodeLayer
BaseBeamSearchLayer
activation_kernels
decoder_masked_multihead_attention
bert_preprocess_kernels
decoding_kernels
unfused_attention_kernels
custom_ar_kernels
custom_ar_comm
gpt_kernels
tensor
memory_utils
nccl_utils
cuda_utils
logger
llama_fmha)