OpenDAS / Lmdeploy · Commits

Commit 981a4610, authored Jul 31, 2023 by Li Zhang, committed by GitHub on Jul 31, 2023. Parent: 83697422.

[Fix] Remove unused code to reduce binary size (#181)

* clean-up
* fix lint
* fix lint
Showing 20 changed files with 0 additions and 2563 deletions:
src/turbomind/layers/FfnFP8Layer.cc (+0 -535)
src/turbomind/layers/FfnFP8Layer.h (+0 -133)
src/turbomind/layers/FfnFP8Weight.h (+0 -30)
src/turbomind/layers/FfnINT8Weight.h (+0 -28)
src/turbomind/layers/FfnLayerINT8.cc (+0 -340)
src/turbomind/layers/FfnLayerINT8.h (+0 -146)
src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h (+0 -34)
src/turbomind/layers/attention_layers_fp8/BaseAttentionFP8Layer.h (+0 -65)
src/turbomind/layers/attention_layers_fp8/CMakeLists.txt (+0 -15)
src/turbomind/layers/attention_layers_int8/AttentionINT8Weight.h (+0 -29)
src/turbomind/layers/attention_layers_int8/CMakeLists.txt (+0 -15)
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.cu (+0 -291)
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h (+0 -80)
src/turbomind/layers/beam_search_layers/BeamSearchLayer.cu (+0 -354)
src/turbomind/layers/beam_search_layers/BeamSearchLayer.h (+0 -68)
src/turbomind/layers/beam_search_layers/CMakeLists.txt (+0 -30)
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.cu (+0 -249)
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h (+0 -65)
src/turbomind/models/llama/CMakeLists.txt (+0 -1)
src/turbomind/models/llama/prefix_cache.cu (+0 -55)
src/turbomind/layers/FfnFP8Layer.cc (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/layers/FfnFP8Layer.h"
#include "src/turbomind/kernels/activation_fp8_kernels.h"
#include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind {

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights)
{
    // input tensors:
    //      input_hidden_state [token_num, d_model],
    // output tensors:
    //      output_hidden_state [token_num, d_model],
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    FT_CHECK(input_tensors->size() == 1);
    FT_CHECK(output_tensors->size() == 1);

    const int m       = input_tensors->at("input_hidden_state").shape[0];
    const int d_model = input_tensors->at("input_hidden_state").shape[1];

    const T1* input_hidden_state = input_tensors->at("input_hidden_state").getPtr<T1>();
    Tensor    output_tensor      = output_tensors->at("output_hidden_state");

    allocateBuffer(m);

#ifdef FUSE_GEMM_ACT
    if (fp8_mode_ == 1) {
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
            ->Gemm(inter_buf_bf16_,
                   (int)1,
                   (int)m,
                   (int)inter_size_,
                   (int)d_model,
                   (int64_t)0,
                   (int64_t)0,
                   (int64_t)0,
                   &alpha,
                   &beta,
                   input_hidden_state,
                   ffn_weights->intermediate_weight.kernel,
                   ffn_weights->intermediate_weight.input_scale,
                   ffn_weights->intermediate_weight.per_channel_scale_min,  // identity_scale
                   stream_);
        invokeAddBiasActivation(m,
                                ffn_weights->intermediate_weight.bias,
                                ffn_weights->intermediate_weight.output_scale,
                                ffn_weights->intermediate_weight.scale,
                                ffn_weights->intermediate_weight.per_channel_scale_min,
                                ffn_weights->output_weight.input_scale_inv);
    }
    else if (fp8_mode_ == 2) {
#ifdef USE_QGMMA
        if (getActivationType() == ActivationType::Gelu) {
            PUSH_RANGE("FFN gemm 1 bias gelu");
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Conv1x1Gemm<false, true>(inter_buf_,
                                           m,
                                           inter_size_,
                                           d_model,
                                           input_hidden_state,
                                           ffn_weights->intermediate_weight.kernel,
                                           ffn_weights->intermediate_weight.bias,
                                           *(ffn_weights->intermediate_weight.input_h_scale),   // scale_a,
                                           *(ffn_weights->intermediate_weight.weight_h_scale),  // scale_b,
                                           *(ffn_weights->output_weight.input_h_scale_inv),     // scale_d,
                                           stream_);
            POP_RANGE;
        }
        else if (getActivationType() == ActivationType::Relu) {
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Conv1x1Gemm<true, false>(inter_buf_,
                                           m,
                                           inter_size_,
                                           d_model,
                                           input_hidden_state,
                                           ffn_weights->intermediate_weight.kernel,
                                           ffn_weights->intermediate_weight.bias,
                                           *(ffn_weights->intermediate_weight.input_h_scale),   // scale_a,
                                           *(ffn_weights->intermediate_weight.weight_h_scale),  // scale_b,
                                           *(ffn_weights->output_weight.input_h_scale_inv),     // scale_d,
                                           stream_);
        }
#else   // USE_QGMMA
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        if (getActivationType() == ActivationType::Gelu) {
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<false, true>(inter_buf_bf16_,
#else   // FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<false, true>(inter_buf_,
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
                                             (int)1,
                                             (int)m,
                                             (int)inter_size_,
                                             (int)d_model,
                                             (int64_t)0,
                                             (int64_t)0,
                                             (int64_t)0,
                                             &alpha,
                                             &beta,
                                             input_hidden_state,
                                             ffn_weights->intermediate_weight.kernel,
                                             ffn_weights->intermediate_weight.input_scale,
                                             ffn_weights->intermediate_weight.weight_scale,
                                             ffn_weights->intermediate_weight.bias,
                                             ffn_weights->intermediate_weight.output_scale,
                                             stream_);
        }
        else if (getActivationType() == ActivationType::Relu) {
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<true, false>(inter_buf_bf16_,
#else   // FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<true, false>(inter_buf_,
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
                                             (int)1,
                                             (int)m,
                                             (int)inter_size_,
                                             (int)d_model,
                                             (int64_t)0,
                                             (int64_t)0,
                                             (int64_t)0,
                                             &alpha,
                                             &beta,
                                             input_hidden_state,
                                             ffn_weights->intermediate_weight.kernel,
                                             ffn_weights->intermediate_weight.input_scale,
                                             ffn_weights->intermediate_weight.weight_scale,
                                             ffn_weights->intermediate_weight.bias,
                                             ffn_weights->intermediate_weight.output_scale,
                                             stream_);
        }
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
        invokeQuantizeMatrix<T1, T2, QUANTIZE_MODE::PER_TENSOR>(
            inter_buf_, ffn_weights->output_weight.input_scale_inv, inter_buf_bf16_, m * inter_size_, 1, stream_);
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
#endif  // USE_QGMMA
    }
#else   // FUSE_GEMM_ACT
    PUSH_RANGE("FFN gemm 1");
#ifdef SPARSITY_ENABLED
    int m_tmp = m;
    if (m_tmp % 8 != 0) {
        m_tmp = (m_tmp / 8 + 1) * 8;
    }
    const int m_padded = m_tmp;
    if (sparse_ && cublas_wrapper_->isUseSparse(1, inter_size_, m, d_model)) {
        FT_CHECK(false);
        // cublas_wrapper_->SpGemm(CUBLAS_OP_N,
        //                         CUBLAS_OP_N,
        //                         inter_size_,
        //                         m_padded,
        //                         d_model,
        //                         ffn_weights->intermediate_weight.sp_kernel,
        //                         input_hidden_state,
        //                         inter_buf_);
    }
    else {
#endif  // SPARSITY_ENABLED
        if (fp8_mode_ == 1) {
            const float alpha = 1.0f;
            const float beta  = 0.0f;
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Gemm(inter_buf_bf16_,
                       (int)1,
                       (int)m,
                       (int)inter_size_,
                       (int)d_model,
                       (int64_t)0,
                       (int64_t)0,
                       (int64_t)0,
                       &alpha,
                       &beta,
                       input_hidden_state,
                       ffn_weights->intermediate_weight.kernel,
                       ffn_weights->intermediate_weight.input_scale,
                       ffn_weights->intermediate_weight.per_channel_scale_min,  // identity_scale
                       stream_);
        }
        else if (fp8_mode_ == 2) {
            const float alpha = 1.0f;
            const float beta  = 0.0f;
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Gemm(inter_buf_bf16_,
                       (int)1,
                       (int)m,
                       (int)inter_size_,
                       (int)d_model,
                       (int64_t)0,
                       (int64_t)0,
                       (int64_t)0,
                       &alpha,
                       &beta,
                       input_hidden_state,
                       ffn_weights->intermediate_weight.kernel,
                       ffn_weights->intermediate_weight.input_scale,
                       ffn_weights->intermediate_weight.weight_scale,
                       stream_);
        }
#ifdef SPARSITY_ENABLED
    }
#endif  // SPARSITY_ENABLED
    POP_RANGE;

    PUSH_RANGE("FFN add bias act");
    if (fp8_mode_ == 1) {
        invokeAddBiasActivation(m,
                                ffn_weights->intermediate_weight.bias,
                                ffn_weights->intermediate_weight.output_scale,
                                ffn_weights->intermediate_weight.scale,
                                ffn_weights->intermediate_weight.per_channel_scale_min,
                                ffn_weights->output_weight.input_scale_inv);
    }
    else if (fp8_mode_ == 2) {
        invokeAddBiasActivation(m,
                                ffn_weights->intermediate_weight.bias,
                                ffn_weights->intermediate_weight.output_scale,
                                nullptr,
                                nullptr,
                                ffn_weights->output_weight.input_scale_inv);
    }
    sync_check_cuda_error();
    POP_RANGE;
#endif  // FUSE_GEMM_ACT

    PUSH_RANGE("FFN gemm 2");
#ifdef SPARSITY_ENABLED
    if (sparse_ && cublas_wrapper_->isUseSparse(1, d_model, m, inter_size_)) {
        FT_CHECK(false);
        // cublas_wrapper_->SpGemm(CUBLAS_OP_N,
        //                         CUBLAS_OP_N,
        //                         d_model,
        //                         m_padded,
        //                         inter_size_,
        //                         ffn_weights->output_weight.sp_kernel,
        //                         inter_buf_,
        //                         output_tensor);
    }
    else {
#endif  // SPARSITY_ENABLED
        if (fp8_mode_ == 1) {
            const float alpha = 1.0f;
            const float beta  = 0.0f;
            if (output_tensor.type == TYPE_BF16) {
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T2>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->identity_scale,
                           stream_);
            }
            else if (output_tensor.type == TYPE_FP8_E4M3) {
                const float alpha = 1.0f;
                const float beta  = 0.0f;
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T1>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->output_weight.per_channel_scale_min,
                           ffn_weights->output_weight.output_scale_inv,
                           stream_);
            }
            else {
                FT_CHECK(false);
            }
        }
        else if (fp8_mode_ == 2) {
            if (output_tensor.type == TYPE_BF16) {
                const float alpha = 1.0f;
                const float beta  = 0.0f;
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T2>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->output_weight.weight_scale,
                           stream_);
            }
            else if (output_tensor.type == TYPE_FP8_E4M3) {
                // It looks like conv1x1Gemm does not bring better performance for this gemm
                // because the k dimension of this gemm is large
                // #ifdef USE_QGMMA
                //     reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                //         ->Conv1x1Gemm<false, false>(output_tensor.getPtr<T1>(),
                //                                     m,
                //                                     d_model,
                //                                     inter_size_,
                //                                     inter_buf_,
                //                                     ffn_weights->output_weight.kernel,
                //                                     ffn_weights->output_weight.bias,
                //                                     *(ffn_weights->output_weight.input_h_scale),       // scale_a,
                //                                     *(ffn_weights->output_weight.weight_h_scale),      // scale_b,
                //                                     *(ffn_weights->output_weight.output_h_scale_inv),  // scale_d,
                //                                     stream_);
                // #else   // USE_QGMMA
                const float alpha = 1.0f;
                const float beta  = 0.0f;
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T1>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->output_weight.weight_scale,
                           ffn_weights->output_weight.output_scale_inv,
                           stream_);
                // #endif  // USE_QGMMA
            }
            else {
                FT_CHECK(false);
            }
        }
#ifdef SPARSITY_ENABLED
    }
#endif  // SPARSITY_ENABLED
    POP_RANGE;

    sync_check_cuda_error();
    if (is_free_buffer_after_forward_ == true) {
        freeBuffer();
    }
    sync_check_cuda_error();
}

template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t           inter_size,
                                 int              fp8_mode,
                                 cudaStream_t     stream,
                                 cublasMMWrapper* cublas_wrapper,
                                 IAllocator*      allocator,
                                 bool             is_free_buffer_after_forward,
                                 bool             sparse):
    BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse),
    inter_size_(inter_size),
    fp8_mode_(fp8_mode)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer):
    BaseLayer(ffn_layer.stream_,
              ffn_layer.cublas_wrapper_,
              ffn_layer.allocator_,
              ffn_layer.is_free_buffer_after_forward_,
              ffn_layer.cuda_device_prop_,
              ffn_layer.sparse_),
    inter_size_(ffn_layer.inter_size_),
    fp8_mode_(ffn_layer.fp8_mode_)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::~FfnFP8Layer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    cublas_wrapper_ = nullptr;
    freeBuffer();
}

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer()
{
    FT_CHECK(false);
}

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    inter_buf_      = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false);
    inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false);
    is_allocate_buffer_ = true;
}

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::freeBuffer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (is_allocate_buffer_) {
        allocator_->free((void**)(&inter_buf_));
        allocator_->free((void**)(&inter_buf_bf16_));
        is_allocate_buffer_ = false;
    }
}

template class FfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;

template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(size_t           inter_size,
                                         int              fp8_mode,
                                         cudaStream_t     stream,
                                         cublasMMWrapper* cublas_wrapper,
                                         IAllocator*      allocator,
                                         bool             is_free_buffer_after_forward,
                                         bool             sparse):
    FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}

template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& gelu_ffn_layer):
    FfnFP8Layer<T1, T2>(gelu_ffn_layer)
{
}

template<typename T1, typename T2>
void GeluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int    m,
                                                      const T2*    bias,
                                                      const float* input_scale,
                                                      const float* input_scale_2,
                                                      const float* input_scale_2_min,
                                                      const float* output_scale)
{
    FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
                                     inter_buf_,
                                     bias,
                                     input_scale,
                                     input_scale_2,
                                     input_scale_2_min,
                                     output_scale,
                                     (uint32_t)m,
                                     (uint32_t)inter_size_,
                                     stream_};
    invokeFP8AddBiasGelu<T1, T2>(param);
}

template class GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;

template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(size_t           inter_size,
                                         int              fp8_mode,
                                         cudaStream_t     stream,
                                         cublasMMWrapper* cublas_wrapper,
                                         IAllocator*      allocator,
                                         bool             is_free_buffer_after_forward,
                                         bool             sparse):
    FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}

template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& relu_ffn_layer):
    FfnFP8Layer<T1, T2>(relu_ffn_layer)
{
}

template<typename T1, typename T2>
void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int    m,
                                                      const T2*    bias,
                                                      const float* input_scale,
                                                      const float* input_scale_2,
                                                      const float* input_scale_2_min,
                                                      const float* output_scale)
{
    FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
                                     inter_buf_,
                                     bias,
                                     input_scale,
                                     input_scale_2,
                                     input_scale_2_min,
                                     output_scale,
                                     (uint32_t)m,
                                     (uint32_t)inter_size_,
                                     stream_};
    invokeFP8AddBiasRelu<T1, T2>(param);
}

template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;

}  // namespace turbomind
src/turbomind/layers/FfnFP8Layer.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnFP8Weight.h"
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {

template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
private:
    void allocateBuffer() override;
    void freeBuffer() override;
    void allocateBuffer(size_t token_num);

protected:
    const int fp8_mode_;
    T1*       inter_buf_      = nullptr;
    T2*       inter_buf_bf16_ = nullptr;
    size_t    inter_size_;

    virtual void invokeAddBiasActivation(const int    m,
                                         const T2*    bias,
                                         const float* input_scale,
                                         const float* input_scale_2,
                                         const float* input_scale_2_min,
                                         const float* output_scale) = 0;

public:
    FfnFP8Layer(size_t           inter_size,
                int              fp8_mode,
                cudaStream_t     stream,
                cublasMMWrapper* cublas_wrapper,
                IAllocator*      allocator,
                bool             is_free_buffer_after_forward,
                bool             sparse = false);

    FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~FfnFP8Layer();

    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights);

    virtual ActivationType getActivationType() = 0;
};

template<typename T1, typename T2>
class GeluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    GeluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode_,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~GeluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Gelu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};

template<typename T1, typename T2>
class ReluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    ReluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~ReluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Relu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};

}  // namespace turbomind
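
For orientation, here is how this (now deleted) FP8 FFN API was driven: pick a concrete activation subclass and call forward with named tensors. The snippet below is a hypothetical usage sketch written for this note, not code from the repository; the stream, wrapper, allocator, device pointers (d_input, d_output) and ffn_weights are assumed to be set up elsewhere following turbomind's conventions at the time.

    // Hypothetical usage sketch for the removed FP8 FFN layer (not part of the commit).
    using T1 = __nv_fp8_e4m3;  // FP8 activation storage type
    using T2 = __nv_bfloat16;  // higher-precision bias/intermediate type

    GeluFfnFP8Layer<T1, T2> ffn(inter_size,
                                /* fp8_mode */ 2,
                                stream,
                                cublas_wrapper,  // must actually point to a cublasFP8MMWrapper
                                allocator,
                                /* is_free_buffer_after_forward */ false);

    TensorMap inputs({{"input_hidden_state", Tensor{MEMORY_GPU, TYPE_FP8_E4M3, {token_num, d_model}, d_input}}});
    TensorMap outputs({{"output_hidden_state", Tensor{MEMORY_GPU, TYPE_BF16, {token_num, d_model}, d_output}}});

    ffn.forward(&outputs, &inputs, &ffn_weights);  // GEMM -> bias + GELU -> GEMM

Note that forward() dispatches on the output tensor's type (TYPE_BF16 vs. TYPE_FP8_E4M3), so the output dtype chooses which second-GEMM path runs.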
src/turbomind/layers/FfnFP8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
    ScaleList* scale_list_ptr;
    float*     identity_scale;
    float*     identity_h_scale;
};

}  // namespace turbomind
src/turbomind/layers/FfnINT8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
    ScaleList* scale_list_ptr;
};

}  // namespace turbomind
src/turbomind/layers/FfnLayerINT8.cc (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "FfnLayerINT8.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind {

template<typename T>
void FfnLayerINT8<T>::forward(std::vector<turbomind::Tensor>*       output_tensors,
                              const std::vector<turbomind::Tensor>* input_tensors,
                              const FfnWeight<T>*                   ffn_weights)
{
    // input_tensors: [input (token_num, hidden_dimension)]
    // output_tensors: [output (token_num, hidden_dimension)]

    ScaleList*           scale_list     = ((const FfnINT8Weight<T>*)ffn_weights)->scale_list_ptr;
    cublasINT8MMWrapper* cublas_wrapper = (cublasINT8MMWrapper*)cublas_wrapper_;

    FT_CHECK(isValidTokenNum(input_tensors->at(0).shape[0]));
    allocateBuffer();

    const int m = static_cast<int>(input_tensors->at(0).shape[0]);
#ifdef SPARSITY_ENABLED
    int m_tmp = m;
    if (m_tmp % 16 != 0) {
        m_tmp = (m_tmp / 16 + 1) * 16;
    }
    const int m_padded = m_tmp;
#endif

    int32_t*      output_tensor = output_tensors->at(0).getPtr<int32_t>();
    const int8_t* input_tensor  = input_tensors->at(0).getPtr<const int8_t>();

    PUSH_RANGE("FFN gemm 1");
    if (int8_mode_ == 1) {
        cublas_wrapper->Gemm(inter_int_buf_,
                             1,
                             m,
                             inter_size_,
                             hidden_units_,
                             0,
                             0,
                             0,
                             input_tensor,
                             (int8_t*)(ffn_weights->intermediate_weight.kernel));
    }
    else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
        if (sparse_) {
            cublas_wrapper->SpGemm(inter_size_,
                                   m_padded,
                                   hidden_units_,
                                   scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
                                   (int8_t*)(ffn_weights->intermediate_weight.sp_kernel),
                                   input_tensor,
                                   (int8_t*)inter_int_buf_);
        }
        else {
#endif
            cublas_wrapper->Gemm((int8_t*)inter_int_buf_,
                                 1,
                                 m,
                                 inter_size_,
                                 hidden_units_,
                                 0,
                                 0,
                                 0,
                                 scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
                                 input_tensor,
                                 (int8_t*)(ffn_weights->intermediate_weight.kernel));
#ifdef SPARSITY_ENABLED
        }
#endif
    }
    POP_RANGE;

    PUSH_RANGE("add bias act");
    invokeAddBiasActivation(m, ffn_weights->intermediate_weight.bias, scale_list);
    POP_RANGE;
    sync_check_cuda_error();

    PUSH_RANGE("FFN gemm 2");
    if (int8_mode_ == 1) {
        cublas_wrapper->Gemm(output_tensor,
                             1,
                             m,
                             hidden_units_,
                             inter_size_,
                             0,
                             0,
                             0,
                             inter_buf_,
                             (int8_t*)(ffn_weights->output_weight.kernel));
    }
    else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
        if (sparse_) {
            cublas_wrapper->SpGemm(hidden_units_,
                                   m_padded,
                                   inter_size_,
                                   scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
                                   (int8_t*)(ffn_weights->output_weight.sp_kernel),
                                   inter_buf_,
                                   (int8_t*)output_tensor);
        }
        else {
#endif
            cublas_wrapper->Gemm((int8_t*)output_tensor,
                                 1,
                                 m,
                                 hidden_units_,
                                 inter_size_,
                                 0,
                                 0,
                                 0,
                                 scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
                                 inter_buf_,
                                 (int8_t*)(ffn_weights->output_weight.kernel));
#ifdef SPARSITY_ENABLED
        }
#endif
    }
    POP_RANGE;

    sync_check_cuda_error();
    if (is_free_buffer_after_forward_ == true) {
        freeBuffer();
    }
    sync_check_cuda_error();
}

template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(size_t           max_batch_size,
                              size_t           max_seq_len,
                              size_t           head_num,
                              size_t           size_per_head,
                              size_t           inter_size,
                              int              int8_mode,
                              cudaStream_t     stream,
                              cublasMMWrapper* cublas_wrapper,
                              IAllocator*      allocator,
                              bool             is_free_buffer_after_forward,
                              bool             sparse):
    BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
    max_token_num_(max_batch_size * max_seq_len),
    head_num_(head_num),
    size_per_head_(size_per_head),
    hidden_units_(head_num * size_per_head),
    inter_size_(inter_size),
    int8_mode_(int8_mode),
    sparse_(sparse)
{
}

template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer):
    BaseLayer(ffn_layer.stream_,
              ffn_layer.cublas_wrapper_,
              ffn_layer.allocator_,
              ffn_layer.is_free_buffer_after_forward_),
    max_token_num_(ffn_layer.max_token_num_),
    head_num_(ffn_layer.head_num_),
    size_per_head_(ffn_layer.size_per_head_),
    hidden_units_(ffn_layer.hidden_units_),
    inter_size_(ffn_layer.inter_size_),
    int8_mode_(ffn_layer.int8_mode_),
    sparse_(ffn_layer.sparse_)
{
}

template<typename T>
FfnLayerINT8<T>::~FfnLayerINT8()
{
    cublas_wrapper_ = nullptr;
    freeBuffer();
}

template<typename T>
void FfnLayerINT8<T>::allocateBuffer()
{
    if (is_allocate_buffer_ == false) {
        inter_int_buf_ =
            (int32_t*)allocator_->reMalloc(inter_int_buf_, sizeof(int32_t) * max_token_num_ * inter_size_, false);
        inter_buf_ = (int8_t*)allocator_->reMalloc(inter_buf_, sizeof(int8_t) * max_token_num_ * inter_size_, false);
        is_allocate_buffer_ = true;
    }
}

template<typename T>
void FfnLayerINT8<T>::freeBuffer()
{
    if (is_allocate_buffer_ == true) {
        allocator_->free((void**)(&inter_int_buf_));
        allocator_->free((void**)(&inter_buf_));
        is_allocate_buffer_ = false;
    }
}

template<typename T>
bool FfnLayerINT8<T>::isValidTokenNum(size_t token_num)
{
    if (max_token_num_ == 0) {
        max_token_num_ = token_num;
        return true;
    }
    else {
        return token_num <= max_token_num_;
    }
}

template class FfnLayerINT8<float>;
template class FfnLayerINT8<half>;

template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(size_t           max_batch_size,
                                      size_t           max_seq_len,
                                      size_t           head_num,
                                      size_t           size_per_head,
                                      size_t           inter_size,
                                      int              int8_mode,
                                      cudaStream_t     stream,
                                      cublasMMWrapper* cublas_wrapper,
                                      IAllocator*      allocator,
                                      bool             is_free_buffer_after_forward,
                                      bool             sparse):
    FfnLayerINT8<T>(max_batch_size,
                    max_seq_len,
                    head_num,
                    size_per_head,
                    inter_size,
                    int8_mode,
                    stream,
                    cublas_wrapper,
                    allocator,
                    is_free_buffer_after_forward,
                    sparse)
{
}

template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& gelu_ffn_layer): FfnLayerINT8<T>(gelu_ffn_layer)
{
}

template<typename T>
void GeluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
    if (int8_mode_ == 1) {
        invokeAddBiasGeluCol32<T>(inter_buf_,
                                  inter_int_buf_,
                                  bias,
                                  m,
                                  inter_size_,
                                  stream_,
                                  &(scale_list->d_scale_list_[scale_list->p2_offset_ + 4 * hidden_units_]),
                                  &(scale_list->d_scale_list_[44 + 2]),
                                  &(scale_list->d_scale_list_[52 + 3]));
    }
    else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
        if (sparse_) {
            invokeAddBiasGeluRow<T>(inter_buf_,
                                    (const int8_t*)inter_int_buf_,
                                    bias,
                                    m,
                                    inter_size_,
                                    stream_,
                                    &(scale_list->d_scale_list_[48 + 1]),
                                    &(scale_list->d_scale_list_[52 + 3]));
        }
        else {
#endif
            invokeAddBiasGeluCol32<T>(inter_buf_,
                                      (const int8_t*)inter_int_buf_,
                                      bias,
                                      m,
                                      inter_size_,
                                      stream_,
                                      &(scale_list->d_scale_list_[48 + 1]),
                                      &(scale_list->d_scale_list_[52 + 3]));
#ifdef SPARSITY_ENABLED
        }
#endif
    }
}

template class GeluFfnLayerINT8<float>;
template class GeluFfnLayerINT8<half>;

template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(size_t           max_batch_size,
                                      size_t           max_seq_len,
                                      size_t           head_num,
                                      size_t           size_per_head,
                                      size_t           inter_size,
                                      int              int8_mode,
                                      cudaStream_t     stream,
                                      cublasMMWrapper* cublas_wrapper,
                                      IAllocator*      allocator,
                                      bool             is_free_buffer_after_forward):
    FfnLayerINT8<T>(max_batch_size,
                    max_seq_len,
                    head_num,
                    size_per_head,
                    inter_size,
                    int8_mode,
                    stream,
                    cublas_wrapper,
                    allocator,
                    is_free_buffer_after_forward)
{
}

template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& relu_ffn_layer): FfnLayerINT8<T>(relu_ffn_layer)
{
}

template<typename T>
void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
    // TODO
}

template class ReluFfnLayerINT8<float>;
template class ReluFfnLayerINT8<half>;

}  // namespace turbomind
src/turbomind/layers/FfnLayerINT8.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/turbomind/kernels/activation_int8_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/utils/ScaleList.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasINT8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {

template<typename T>
class GeluFfnLayerINT8;
template<typename T>
class ReluFfnLayerINT8;

template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
    // buffer handling
    size_t max_token_num_ = 0;

    // meta data
    size_t head_num_;
    size_t size_per_head_;

    // calculated data
    size_t hidden_units_;

    void allocateBuffer() override;
    void freeBuffer() override;
    bool isValidTokenNum(size_t token_num);

protected:
    size_t  inter_size_;
    int     int8_mode_;
    bool    sparse_;
    int*    inter_int_buf_;
    int8_t* inter_buf_;

    virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;

public:
    FfnLayerINT8(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           inter_size,
                 int              int8_mode,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse = false);

    FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);

    ~FfnLayerINT8();

    void forward(std::vector<turbomind::Tensor>*       output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors,
                 const FfnWeight<T>*                   ffn_weights);

    friend GeluFfnLayerINT8<T>;
    friend ReluFfnLayerINT8<T>;
};

template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    GeluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward,
                     bool             sparse = false);

    GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);

    ~GeluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::sparse_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};

template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    ReluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward);

    ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);

    ~ReluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};

}  // namespace turbomind
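
Unlike the FP8 layer's named TensorMap interface, this INT8 layer takes positional std::vector<Tensor> arguments: input slot 0 carries the int8 activations, output slot 0 receives the result. The following is a hypothetical call sketch written for this note, with device buffers and weights assumed to exist (not code from the repository):

    // Hypothetical usage sketch for the removed INT8 FFN layer (not part of the commit).
    std::vector<turbomind::Tensor> inputs{
        turbomind::Tensor{MEMORY_GPU, TYPE_INT8, {token_num, hidden_units}, d_input_int8}};
    std::vector<turbomind::Tensor> outputs{
        turbomind::Tensor{MEMORY_GPU, TYPE_INT32, {token_num, hidden_units}, d_output_int32}};

    GeluFfnLayerINT8<half> ffn(max_batch_size, max_seq_len, head_num, size_per_head,
                               inter_size, /* int8_mode */ 2, stream,
                               cublas_wrapper,  // must actually point to a cublasINT8MMWrapper
                               allocator, /* is_free_buffer_after_forward */ false);

    // ffn_weights must really be an FfnINT8Weight<half>, since forward() casts it
    // back to that type to recover scale_list_ptr.
    ffn.forward(&outputs, &inputs, (const turbomind::FfnWeight<half>*)&ffn_weights);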
src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T1, typename T2>
struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
    const float* qk_scale;
    const float* qk_scale_inv;
    float*       qk_h_scale;
    float*       qk_h_scale_inv;
    float*       identity_scale;
    float*       identity_h_scale;
};

}  // namespace turbomind
src/turbomind/layers/attention_layers_fp8/BaseAttentionFP8Layer.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <vector>
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
namespace turbomind {
// template<typename T>
// AttentionType getAttentionType(size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len,
// const bool is_fuse = true)
// {
// if (std::is_same<T, half>::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm ==
// kSM_72)
// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) {
// return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
// }
// else {
// return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
// }
// }
template<typename T1, typename T2>
class BaseAttentionFP8Layer: public BaseLayer {
public:
    virtual void
    forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionFP8Weight<T1, T2>* attention_weights) = 0;

    BaseAttentionFP8Layer(cudaStream_t     stream,
                          cublasMMWrapper* cublas_wrapper,
                          IAllocator*      allocator,
                          bool             is_free_buffer_after_forward,
                          bool             sparse = false):
        BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse)
    {
    }

    virtual ~BaseAttentionFP8Layer() = default;
};

}  // namespace turbomind
src/turbomind/layers/attention_layers_fp8/CMakeLists.txt (deleted, 100644 → 0)
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
src/turbomind/layers/attention_layers_int8/AttentionINT8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T>
struct AttentionINT8Weight: AttentionWeight<T> {
    ScaleList* scale_list_ptr;
};

}  // namespace turbomind
src/turbomind/layers/attention_layers_int8/CMakeLists.txt (deleted, 100644 → 0)
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.cu (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {

__global__ void update_indir_cache_kernel(int*        tgt_indir_cache,
                                          const int*  src_indir_cache,
                                          const int*  beam_ids,
                                          const bool* finished,
                                          int         start_step,
                                          int         batch_dim,
                                          int         local_batch_size,
                                          int         beam_width,
                                          int         max_seq_len,
                                          int         step)
{
    int       time_step = threadIdx.x + blockIdx.x * blockDim.x;
    int       bb_id     = threadIdx.y + blockIdx.y * blockDim.y;
    const int batch_id  = bb_id / beam_width;
    const int beam_id   = bb_id % beam_width;

    if (bb_id >= beam_width * local_batch_size || time_step >= min(step + 1, max_seq_len) || finished[bb_id]) {
        return;
    }
    time_step += start_step;
    const int time_step_circ = time_step % max_seq_len;

    const int src_beam = beam_ids[batch_id * beam_width + beam_id];

    const uint tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ;
    const uint src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ;

    tgt_indir_cache[tgt_offset] = (time_step == step) ? beam_id : src_indir_cache[src_offset];
}

void update_indir_cache_kernelLauncher(int*         tgt_indir_cache,
                                       const int*   src_indir_cache,
                                       const int*   beam_ids,
                                       const bool*  finished,
                                       int          batch_dim,
                                       int          local_batch_size,
                                       int          beam_width,
                                       int          max_seq_len,
                                       int          step,
                                       cudaStream_t stream)
{
    const dim3 block(32);
    const int  start_step = max(0, step + 1 - max_seq_len);
    const int  num_steps  = min(step + 1, max_seq_len);
    // Update indirections steps [start_step, step], included
    const dim3 grid((num_steps + block.x - 1) / block.x, local_batch_size * beam_width);
    update_indir_cache_kernel<<<grid, block, 0, stream>>>(tgt_indir_cache,
                                                          src_indir_cache,
                                                          beam_ids,
                                                          finished,
                                                          start_step,
                                                          batch_dim,
                                                          local_batch_size,
                                                          beam_width,
                                                          max_seq_len,
                                                          step);
}
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(size_t           max_batch_size,
                                            size_t           head_num,
                                            size_t           size_per_head,
                                            size_t           beam_width,
                                            size_t           vocab_size,
                                            size_t           vocab_size_padded,
                                            int              end_id,
                                            float            diversity_rate,
                                            float            temperature,
                                            float            len_penalty,
                                            float            repetition_penalty,
                                            cudaStream_t     stream,
                                            cublasMMWrapper* cublas_wrapper,
                                            IAllocator*      allocator,
                                            bool             is_free_buffer_after_forward):
    DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr),
    vocab_size_(vocab_size),
    vocab_size_padded_(vocab_size_padded)
{
}

template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer):
    DynamicDecodeBaseLayer(beam_search_layer),
    vocab_size_(beam_search_layer.vocab_size_),
    vocab_size_padded_(beam_search_layer.vocab_size_padded_),
    topk_softmax_workspace_size_(beam_search_layer.topk_softmax_workspace_size_)
{
}

template<typename T>
BaseBeamSearchLayer<T>::~BaseBeamSearchLayer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    freeBuffer();
}

template<typename T>
void BaseBeamSearchLayer<T>::freeBuffer()
{
    if (is_allocate_buffer_) {
        allocator_->free((void**)(&topk_softmax_workspace_));
        is_allocate_buffer_ = false;
    }
}

template<typename T>
void BaseBeamSearchLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
    // do nothing.
}

template<typename T>
void BaseBeamSearchLayer<T>::forward(std::vector<Tensor>* output_tensors, const std::vector<Tensor>* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width]
    //      ite [1] on cpu

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width]
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width]
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]

    std::unordered_map<std::string, Tensor> input_tensors_map{{"logits", input_tensors->at(0)},
                                                              {"embedding_bias", input_tensors->at(1)},
                                                              {"step", input_tensors->at(2)},
                                                              {"src_cache_indirection", input_tensors->at(4)},
                                                              {"max_input_length", input_tensors->at(5)},
                                                              {"input_lengths", input_tensors->at(6)},
                                                              {"ite", input_tensors->at(7)}};

    std::unordered_map<std::string, Tensor> output_tensors_map{{"output_ids", output_tensors->at(0)},
                                                               {"finished", output_tensors->at(1)},
                                                               {"cum_log_probs", output_tensors->at(2)},
                                                               {"parent_ids", output_tensors->at(3)},
                                                               {"sequence_length", output_tensors->at(4)},
                                                               {"tgt_cache_indirection", output_tensors->at(5)}};
    forward(&output_tensors_map, &input_tensors_map);
}

template<typename T>
void BaseBeamSearchLayer<T>::forward(std::unordered_map<std::string, Tensor>*       output_tensors,
                                     const std::unordered_map<std::string, Tensor>* input_tensors)
{
    TensorMap input_map(*input_tensors);
    TensorMap output_map(*output_tensors);
    forward(&output_map, &input_map);
}

template<typename T>
void BaseBeamSearchLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      end_id [local_batch_size]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width], optional
    //      ite [1] on cpu
    //      beam_search_diversity_rate [1] on cpu, optional
    //      temperature [1] on cpu, optional
    //      len_penalty [1] on cpu, optional
    //      repetition_penalty [1] on cpu, optional
    //      presence_penalty [1] on cpu, optional
    //          Only one of repetition and presence penalties is allowed.
    //      min_length [1] on cpu, int, optional

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width], optional
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width], optional
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      output_log_probs [max_seq_len, batch_size, beam_width], optional
    //      beam_hyps, optional

    FT_CHECK(input_tensors->size() >= 7);
    FT_CHECK(output_tensors->size() >= 5);

    const int batch_size = output_tensors->at("output_ids").shape[1];
    const int beam_width = output_tensors->at("output_ids").shape[2];
    allocateBuffer(batch_size, beam_width);

    const int step             = input_tensors->at("step").getVal<int>();
    const int ite              = input_tensors->at("ite").getVal<int>();
    const int local_batch_size = input_tensors->at("logits").shape[0];

    const float temperature    = input_tensors->getVal<float>("temperature", 1.0f);
    const T*    embedding_bias = input_tensors->getPtr<const T>("embedding_bias", nullptr);

    RepetitionPenaltyType repetition_penalty_type = RepetitionPenaltyType::None;
    float                 repetition_penalty      = getDefaultPenaltyValue(repetition_penalty_type);
    if (input_tensors->isExist("repetition_penalty") || input_tensors->isExist("presence_penalty")) {
        FT_CHECK_WITH_INFO(
            !(input_tensors->isExist("repetition_penalty") && input_tensors->isExist("presence_penalty")),
            "Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
            "Please provide one of repetition_penalty or presence_penalty.");
        repetition_penalty_type = input_tensors->isExist("repetition_penalty") ? RepetitionPenaltyType::Multiplicative :
                                                                                 RepetitionPenaltyType::Additive;
        repetition_penalty      = repetition_penalty_type == RepetitionPenaltyType::Multiplicative ?
                                      input_tensors->getVal<float>("repetition_penalty") :
                                      input_tensors->getVal<float>("presence_penalty");
    }

    invokeAddBiasApplyPenalties(
        step,
        input_tensors->at("logits").getPtr<T>(),
        output_tensors->at("output_ids")
            .getPtrWithOffset<const int>((step - 1) * batch_size * beam_width + ite * local_batch_size * beam_width),
        output_tensors->getPtr<const int>("output_ids"),
        output_tensors->getPtr<const int>("parent_ids"),
        input_tensors->getPtr<const int>("input_lengths", nullptr),
        output_tensors->getPtr<const int>("sequence_length", nullptr),
        embedding_bias,
        ite,
        input_tensors->getVal<int>("max_input_length"),
        local_batch_size,
        batch_size,
        beam_width,
        vocab_size_,
        vocab_size_padded_,
        input_tensors->getPtr<const int>("end_id", nullptr),
        temperature,
        repetition_penalty,
        repetition_penalty_type,
        input_tensors->getVal<const int>("min_length", 0),
        stream_);
    sync_check_cuda_error();

    invokeSoftMax(output_tensors, input_tensors);

    if (beam_width > 1) {
        const int max_seq_len = output_tensors->at("output_ids").shape[0];
        update_indir_cache_kernelLauncher(
            output_tensors->at("tgt_cache_indirection").getPtr<int>(),
            input_tensors->at("src_cache_indirection").getPtr<const int>(),
            output_tensors->at("parent_ids")
                .getPtrWithOffset<const int>(+step * beam_width * batch_size + ite * local_batch_size * beam_width),
            output_tensors->at("finished").getPtr<const bool>(),
            batch_size,
            local_batch_size,
            beam_width,
            max_seq_len,
            step,
            stream_);
        sync_check_cuda_error();
    }
    sync_check_cuda_error();
    if (is_free_buffer_after_forward_) {
        freeBuffer();
    }
    sync_check_cuda_error();
}

template class BaseBeamSearchLayer<float>;
template class BaseBeamSearchLayer<half>;

}  // namespace turbomind
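
The indirection-cache update above is easier to follow in scalar form. The host-side loop below is an illustrative sketch written for this note (it is not part of the commit); it assumes plain row-major [batch, beam, max_seq_len] layouts, matching the offsets used in update_indir_cache_kernel.

    // Host-side reference for update_indir_cache_kernel (illustrative sketch only).
    void update_indir_cache_reference(int* tgt, const int* src, const int* beam_ids,
                                      const bool* finished, int start_step,
                                      int local_batch_size, int beam_width,
                                      int max_seq_len, int step)
    {
        for (int b = 0; b < local_batch_size; ++b) {
            for (int beam = 0; beam < beam_width; ++beam) {
                const int bb = b * beam_width + beam;
                if (finished[bb]) {
                    continue;
                }
                const int src_beam = beam_ids[bb];  // parent beam selected at this step
                for (int t = start_step; t <= step; ++t) {
                    const int tc  = t % max_seq_len;  // circular cache slot
                    const int row = b * beam_width * max_seq_len;
                    tgt[row + beam * max_seq_len + tc] =
                        (t == step) ? beam : src[row + src_beam * max_seq_len + tc];
                }
            }
        }
    }

Rewriting old steps through src_beam is what lets beam search reuse a single KV cache across re-ranked beams; the modulo on the time step handles the circular cache once step exceeds max_seq_len.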
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
namespace turbomind {

template<typename T>
class BaseBeamSearchLayer: public DynamicDecodeBaseLayer {
private:
    void freeBuffer();

protected:
    // meta data
    size_t vocab_size_;
    size_t vocab_size_padded_;

    size_t topk_softmax_workspace_size_;
    void*  topk_softmax_workspace_ = nullptr;

    virtual void allocateBuffer() = 0;
    virtual void allocateBuffer(size_t batch_size, size_t beam_width) = 0;
    virtual void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) = 0;

public:
    BaseBeamSearchLayer(size_t           max_batch_size,
                        size_t           head_num,
                        size_t           size_per_head,
                        size_t           beam_width,
                        size_t           vocab_size,
                        size_t           vocab_size_padded,
                        int              end_id,
                        float            diversity_rate,
                        float            temperature,
                        float            len_penalty,
                        float            repetition_penalty,
                        cudaStream_t     stream,
                        cublasMMWrapper* cublas_wrapper,
                        IAllocator*      allocator,
                        bool             is_free_buffer_after_forward);

    BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer);

    ~BaseBeamSearchLayer();

    void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;

    void forward(std::vector<turbomind::Tensor>*       output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors) override;

    void forward(std::unordered_map<std::string, Tensor>*       output_tensors,
                 const std::unordered_map<std::string, Tensor>* input_tensors) override;

    void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
};

void update_indir_cache_kernelLauncher(int*         tgt_indir_cache,
                                       const int*   src_indir_cache,
                                       const int*   beam_ids,
                                       const bool*  finished,
                                       int          batch_dim,
                                       int          beam_width,
                                       int          max_seq_len,
                                       int          ite,
                                       cudaStream_t stream);

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/BeamSearchLayer.cu (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
namespace turbomind {

template<typename T>
__global__ void logProbAddCumLogProb(float*       log_probs,
                                     const T*     logits,
                                     const float* cum_log_probs,
                                     const int*   end_ids,
                                     const bool*  finished,
                                     const int    beam_width,
                                     const int    n)
{
    int  bid    = blockIdx.x;
    bool finish = finished != nullptr ? finished[bid] : false;
    int  offset = bid * n;

    float            max_val = -1 * FLT_MAX;
    __shared__ float s_max_val;
    __shared__ float s_sum_val;

    if (finish) {
        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = (tid == end_ids[bid / beam_width]) ? cum_log_probs[bid] : -FLT_MAX;
        }
    }
    else {
        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = (float)(logits[offset + tid]);
            max_val                 = max(max_val, log_probs[offset + tid]);
        }

        max_val = blockReduceMax(max_val);
        if (threadIdx.x == 0) {
            s_max_val = max_val;
        }
        __syncthreads();

        float sum_val = 0.0f;
        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = __expf(log_probs[offset + tid] - s_max_val);
            sum_val += log_probs[offset + tid];
        }

        sum_val = blockReduceSum(sum_val);
        if (threadIdx.x == 0) {
            s_sum_val = sum_val + 1e-6f;
        }
        __syncthreads();

        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = logf(log_probs[offset + tid] / s_sum_val) + cum_log_probs[bid];
        }
    }
}

template<typename T>
void invokeLogProbAddCumLogProb(float*       log_probs,
                                const T*     logits,
                                const float* cum_log_probs,
                                const int*   end_ids,
                                const bool*  finished,
                                const int    m,
                                const int    beam_width,
                                const int    n,
                                cudaStream_t stream)
{
    dim3 grid(m);
    dim3 block(min(n, 1024));
    /* n is the vocab_size, e.g., 30000, 7000...; vocab_size is usually very big. */
    logProbAddCumLogProb<<<grid, block, 0, stream>>>(
        log_probs, logits, cum_log_probs, end_ids, finished, beam_width, n);
}
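Stripped of the CUDA plumbing, the kernel above is a numerically stable log-softmax per beam, with the beam's running score added back in. A one-row CPU sketch of the same math (the helper name and single-row scope are ours, not part of the deleted file):

#include <algorithm>
#include <cfloat>
#include <cmath>

// CPU reference of the non-finished branch: log-softmax of one row of
// logits, shifted by that beam's cumulative log-probability.
void logProbAddCumLogProbRef(float* log_probs, const float* logits, float cum_log_prob, int n)
{
    float max_val = -FLT_MAX;
    for (int i = 0; i < n; ++i)
        max_val = std::max(max_val, logits[i]);

    float sum_val = 0.0f;
    for (int i = 0; i < n; ++i)
        sum_val += std::exp(logits[i] - max_val);
    sum_val += 1e-6f;  // same epsilon the kernel adds before the log

    for (int i = 0; i < n; ++i)
        log_probs[i] = std::log(std::exp(logits[i] - max_val) / sum_val) + cum_log_prob;
}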
template<typename T>
__global__ void updateStatesKernel(T*             log_probs,
                                   T*             cum_log_probs,
                                   float*         output_log_probs,
                                   bool*          finished,
                                   int*           parent_ids,
                                   int*           sequence_length,
                                   int*           word_ids,
                                   int*           output_ids,
                                   BeamHypotheses beam_hyps,
                                   const int      local_batch_size,
                                   const int      beam_width,
                                   const int      vocab_size,
                                   const int*     end_ids)
{
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
         index += blockDim.x * gridDim.x) {
        int batch_id = index / beam_width;

        sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;

        int beam_id = (word_ids[index] / vocab_size) % beam_width;
        int word_id = word_ids[index] % vocab_size;

        if (output_log_probs != nullptr) {
            // get the cum_log_probs of previous run
            output_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id]
                                      - cum_log_probs[batch_id * beam_width + beam_id];
        }
        cum_log_probs[index]   = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id];
        sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
        finished[index]        = word_id == end_ids[batch_id] ? 1 : 0;
        parent_ids[index]      = beam_id;
        word_ids[index]        = word_id;
        output_ids[index]      = word_id;

        if (beam_hyps.num_beams != nullptr) {
            if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
                for (int i = 0; i < beam_width; i++) {
                    finished[batch_id * beam_width + i] = true;
                }
            }
        }
    }
}

void invokeUpdateStates(float*          log_probs,
                        float*          cum_log_probs,
                        float*          output_log_probs,
                        bool*           finished,
                        int*            parent_ids,
                        int*            sequence_length,
                        int*            word_ids,
                        int*            output_ids,
                        BeamHypotheses* beam_hyps,
                        const int       local_batch_size,
                        const int       beam_width,
                        const int       vocab_size,
                        const int*      end_ids,
                        cudaStream_t    stream)
{
    dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
    dim3 block(256);

    updateStatesKernel<float><<<grid, block, 0, stream>>>(log_probs,
                                                          cum_log_probs,
                                                          output_log_probs,
                                                          finished,
                                                          parent_ids,
                                                          sequence_length,
                                                          word_ids,
                                                          output_ids,
                                                          *beam_hyps,
                                                          local_batch_size,
                                                          beam_width,
                                                          vocab_size,
                                                          end_ids);
}
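The beam_id / word_id decomposition above assumes the preceding top-k pass packs each selected candidate as a flat index into the per-batch [beam_width, vocab_size] score matrix. A worked instance with illustrative numbers:

// Illustrative decomposition: with beam_width = 4 and vocab_size = 32000,
// a packed candidate id of 2 * 32000 + 17 = 64017 yields
//   beam_id = (64017 / 32000) % 4 = 2   // parent beam being extended
//   word_id =  64017 % 32000     = 17   // token appended to that beam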
template<typename T>
void BeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width]
    //      ite [1] on cpu
    //      beam_search_diversity_rate [1] on cpu, optional
    //      temperature [1] on cpu, optional
    //      len_penalty [1] on cpu, optional
    //      repetition_penalty [1] on cpu, optional

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width]
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width]
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      output_log_probs [max_seq_len, batch_size * beam_width], optional
    //      beam_hyps, optional

    FT_CHECK(input_tensors->size() >= 7);
    FT_CHECK(output_tensors->size() >= 6);

    const int batch_size       = output_tensors->at("output_ids").shape[1];
    const int beam_width       = output_tensors->at("output_ids").shape[2];
    const int step             = input_tensors->at("step").getVal<int>();
    const int ite              = input_tensors->at("ite").getVal<int>();
    const int local_batch_size = input_tensors->at("logits").shape[0];

    const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
                                     input_tensors->at("beam_search_diversity_rate").getVal<float>() :
                                     0.0f;
    const float length_penalty =
        input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;

    const int id_offset = step * batch_size * beam_width + ite * local_batch_size * beam_width;
    invokeLogProbAddCumLogProb(float_log_prob_buf_,
                               input_tensors->at("logits").getPtr<T>(),
                               output_tensors->at("cum_log_probs").getPtr<float>(),
                               input_tensors->at("end_id").getPtr<const int>(),
                               output_tensors->at("finished").getPtr<bool>(),
                               local_batch_size * beam_width,
                               beam_width,
                               vocab_size_padded_,
                               stream_);
    sync_check_cuda_error();

    BeamHypotheses beam_hyps;
    if (output_tensors->isExist("beam_hyps") && diversity_rate == 0.0f) {
        beam_hyps                      = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
        beam_hyps.step                 = step;
        beam_hyps.ite                  = ite;
        beam_hyps.local_batch_size     = local_batch_size;
        beam_hyps.batch_size           = output_tensors->at("output_ids").shape[1];
        beam_hyps.max_seq_len          = output_tensors->at("output_ids").shape[0];
        beam_hyps.output_ids_src       = output_tensors->at("output_ids").getPtr<int>();
        beam_hyps.parent_ids_src       = output_tensors->at("parent_ids").getPtr<int>();
        beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
        beam_hyps.length_penalty       = length_penalty;
    }

    invokeTopkBeamSearch<float>(topk_softmax_workspace_,
                                topk_softmax_workspace_size_,
                                float_log_prob_buf_,
                                output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                                &beam_hyps,
                                output_tensors->at("finished").getPtr<bool>(),
                                output_tensors->isExist("sequence_length") ?
                                    output_tensors->at("sequence_length").getPtr<int>() :
                                    (int*)nullptr,
                                local_batch_size,
                                beam_width,
                                vocab_size_padded_,
                                diversity_rate,
                                length_penalty,
                                input_tensors->at("end_id").getPtr<const int>(),
                                stream_);
    sync_check_cuda_error();

    invokeUpdateStates(float_log_prob_buf_,
                       output_tensors->at("cum_log_probs").getPtr<float>(),
                       output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
                       output_tensors->at("finished").getPtr<bool>(),
                       output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
                       output_tensors->at("sequence_length").getPtr<int>(),
                       output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                       output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                       &beam_hyps,
                       local_batch_size,
                       beam_width,
                       vocab_size_padded_,
                       input_tensors->at("end_id").getPtr<const int>(),
                       stream_);
    sync_check_cuda_error();
}
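The id_offset computed above selects where this step's ids land in the step-major output buffers. A worked instance, assuming batch_size = local_batch_size = 2, beam_width = 4, step = 3, and ite = 0 (illustrative values):

// id_offset = step * batch_size * beam_width + ite * local_batch_size * beam_width
//           = 3 * 2 * 4 + 0 * 2 * 4 = 24
// so writes for step 3 begin 24 ints into output_ids, whose layout is
// [max_seq_len, batch_size, beam_width].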
template<typename T>
void BeamSearchLayer<T>::allocateBuffer()
{
    FT_CHECK(false);
}

template<typename T>
void BeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    invokeTopkBeamSearch<float>(nullptr,
                                topk_softmax_workspace_size_,
                                nullptr,
                                nullptr,
                                nullptr,
                                nullptr,
                                nullptr,
                                batch_size,
                                beam_width,
                                vocab_size_padded_,
                                0.0f,  // diversity rate
                                0.0f,  // length penalty
                                nullptr,
                                stream_);
    topk_softmax_workspace_ = reinterpret_cast<float*>(allocator_->reMalloc(
        topk_softmax_workspace_,
        topk_softmax_workspace_size_ + sizeof(float) * batch_size * beam_width * vocab_size_padded_,
        false));
    float_log_prob_buf_ = (float*)((char*)topk_softmax_workspace_ + topk_softmax_workspace_size_);
    is_allocate_buffer_ = true;
}
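Note that a single reMalloc backs two regions here: the first topk_softmax_workspace_size_ bytes are the top-k scratch, and float_log_prob_buf_ starts immediately after it. A minimal sketch of the same carve-up, with ws_bytes and n as hypothetical stand-ins for the two sizes above:

#include <cstdlib>

// Single-allocation carve-up: scratch bytes first, float buffer right after.
// ws_bytes / n stand in for topk_softmax_workspace_size_ and
// batch_size * beam_width * vocab_size_padded_.
void workspaceLayoutSketch(size_t ws_bytes, size_t n)
{
    char*  base      = static_cast<char*>(std::malloc(ws_bytes + sizeof(float) * n));
    void*  scratch   = base;                                       // top-k workspace
    float* log_probs = reinterpret_cast<float*>(base + ws_bytes);  // log-prob buffer
    (void)scratch;
    (void)log_probs;
    std::free(base);
}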
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(size_t           max_batch_size,
                                    size_t           head_num,
                                    size_t           size_per_head,
                                    size_t           beam_width,
                                    size_t           vocab_size,
                                    size_t           vocab_size_padded,
                                    int              end_id,
                                    float            diversity_rate,
                                    float            temperature,
                                    float            len_penalty,
                                    float            repetition_penalty,
                                    cudaStream_t     stream,
                                    cublasMMWrapper* cublas_wrapper,
                                    IAllocator*      allocator,
                                    bool             is_free_buffer_after_forward):
    BaseBeamSearchLayer<T>(max_batch_size,
                           head_num,
                           size_per_head,
                           beam_width,
                           vocab_size,
                           vocab_size_padded,
                           end_id,
                           diversity_rate,
                           temperature,
                           len_penalty,
                           repetition_penalty,
                           stream,
                           cublas_wrapper,
                           allocator,
                           is_free_buffer_after_forward)
{
}

template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer):
    BaseBeamSearchLayer<T>(beam_search_layer)
{
}

template<typename T>
BeamSearchLayer<T>::~BeamSearchLayer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template class BeamSearchLayer<float>;
template class BeamSearchLayer<half>;

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/BeamSearchLayer.h
deleted
100644 → 0
View file @
83697422
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include <float.h>
namespace turbomind {

template<typename T>
class BeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
    // meta data
    using BaseBeamSearchLayer<T>::vocab_size_;
    using BaseBeamSearchLayer<T>::vocab_size_padded_;

    using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
    using BaseBeamSearchLayer<T>::topk_softmax_workspace_;

    void allocateBuffer() override;
    void allocateBuffer(size_t batch_size, size_t beam_width) override;

    void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;

    using BaseBeamSearchLayer<T>::stream_;
    using BaseBeamSearchLayer<T>::is_allocate_buffer_;
    using BaseBeamSearchLayer<T>::allocator_;

    float* float_log_prob_buf_ = nullptr;

protected:
public:
    BeamSearchLayer(size_t           max_batch_size,
                    size_t           head_num,
                    size_t           size_per_head,
                    size_t           beam_width,
                    size_t           vocab_size,
                    size_t           vocab_size_padded,
                    int              end_id,
                    float            diversity_rate,
                    float            temperature,
                    float            len_penalty,
                    float            repetition_penalty,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward);

    BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer);

    ~BeamSearchLayer();
};

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/CMakeLists.txt
deleted
100644 → 0
View file @
83697422
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)

add_library(BaseBeamSearchLayer STATIC BaseBeamSearchLayer.cu)
set_property(TARGET BaseBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseBeamSearchLayer PUBLIC -lcudart beam_search_penalty_kernels cuda_utils)

add_library(OnlineBeamSearchLayer STATIC OnlineBeamSearchLayer.cu)
set_property(TARGET OnlineBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET OnlineBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(OnlineBeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer online_softmax_beamsearch_kernels)

add_library(BeamSearchLayer STATIC BeamSearchLayer.cu)
set_property(TARGET BeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer beam_search_topk_kernels)
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.cu
deleted
100644 → 0
View file @
83697422
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
namespace turbomind {

static const int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;
static const int MAX_K                             = 4;

template<typename T>
__global__ void update_kernel(bool*          finished,
                              int*           parent_ids,
                              int*           sequence_length,
                              int*           word_ids,
                              int*           output_ids,
                              BeamHypotheses beam_hyps,
                              const int      vocab_size,
                              const int*     end_ids,
                              const int      local_batch_size,
                              const int      beam_width)
{
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
         index += blockDim.x * gridDim.x) {
        int batch_id = index / beam_width;

        sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;

        int beam_id = (word_ids[index] / vocab_size) % beam_width;
        int word_id = word_ids[index] % vocab_size;

        sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
        finished[index]        = word_id == end_ids[index / beam_width] ? 1 : 0;
        parent_ids[index]      = beam_id;
        word_ids[index]        = word_id;
        output_ids[index]      = word_id;

        if (beam_hyps.num_beams != nullptr) {
            if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
                for (int i = 0; i < beam_width; i++) {
                    finished[batch_id * beam_width + i] = true;
                }
            }
        }
    }
}

void invokeUpdate(bool*           finished,
                  int*            parent_ids,
                  int*            sequence_length,
                  int*            word_ids,
                  int*            output_ids,
                  BeamHypotheses* beam_hyps,
                  const int       local_batch_size,
                  const int       beam_width,
                  const int       vocab_size_padded,
                  const int*      end_ids,
                  cudaStream_t    stream)
{
    dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
    dim3 block(256);

    update_kernel<float><<<grid, block, 0, stream>>>(finished,
                                                     parent_ids,
                                                     sequence_length,
                                                     word_ids,
                                                     output_ids,
                                                     *beam_hyps,
                                                     vocab_size_padded,
                                                     end_ids,
                                                     local_batch_size,
                                                     beam_width);
}
template<typename T>
void OnlineBeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width]
    //      ite [1] on cpu
    //      beam_search_diversity_rate [1] on cpu, optional
    //      temperature [1] on cpu, optional
    //      len_penalty [1] on cpu, optional
    //      repetition_penalty [1] on cpu, optional

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width]
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width]
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      output_log_probs [max_seq_len, batch_size, beam_width]

    FT_CHECK(input_tensors->size() >= 7);
    FT_CHECK(output_tensors->size() >= 6);

    const int batch_size       = output_tensors->at("output_ids").shape[1];
    const int beam_width       = output_tensors->at("output_ids").shape[2];
    const int step             = input_tensors->at("step").getVal<int>();
    const int ite              = input_tensors->at("ite").getVal<int>();
    const int local_batch_size = input_tensors->at("logits").shape[0];

    const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
                                     input_tensors->at("beam_search_diversity_rate").getVal<float>() :
                                     0.0f;
    const float length_penalty =
        input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;

    const int id_offset = step * batch_size * beam_width + local_batch_size * ite * beam_width;

    BeamHypotheses beam_hyps;
    if (output_tensors->isExist("beam_hyps")) {
        beam_hyps                      = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
        beam_hyps.step                 = step;
        beam_hyps.ite                  = ite;
        beam_hyps.local_batch_size     = local_batch_size;
        beam_hyps.batch_size           = output_tensors->at("output_ids").shape[1];
        beam_hyps.max_seq_len          = output_tensors->at("output_ids").shape[0];
        beam_hyps.output_ids_src       = output_tensors->at("output_ids").getPtr<int>();
        beam_hyps.parent_ids_src       = output_tensors->at("parent_ids").getPtr<int>();
        beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
        beam_hyps.log_probs_src        = output_tensors->getPtr<float>("output_log_probs", nullptr);
        beam_hyps.length_penalty       = length_penalty;
        beam_hyps.end_ids              = input_tensors->at("end_id").getPtr<int>();
    }

    invokeTopkSoftMax(input_tensors->at("logits").getPtr<T>(),
                      (const T*)(nullptr),
                      output_tensors->at("finished").getPtr<bool>(),
                      output_tensors->at("sequence_length").getPtr<int>(),
                      output_tensors->at("cum_log_probs").getPtr<float>(),
                      output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
                      output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                      topk_softmax_workspace_,
                      topk_softmax_workspace_size_,
                      &beam_hyps,
                      local_batch_size,
                      beam_width,
                      vocab_size_padded_,
                      input_tensors->at("end_id").getPtr<int>(),
                      diversity_rate,
                      length_penalty,
                      stream_);
    sync_check_cuda_error();

    invokeUpdate(output_tensors->at("finished").getPtr<bool>(),
                 output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
                 output_tensors->at("sequence_length").getPtr<int>(),
                 output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                 output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                 &beam_hyps,
                 local_batch_size,
                 beam_width,
                 vocab_size_padded_,
                 input_tensors->at("end_id").getPtr<const int>(),
                 stream_);
    sync_check_cuda_error();
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer()
{
    FT_CHECK(false);
}

template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    // we need to check 2 * beam_width candidates each time
    // 64 is the max beam width we support now.
    topk_softmax_workspace_size_ =
        (size_t)(ceil(batch_size * 64 * (64 * 2) / 4.) * 4 * 2
                 + ceil(batch_size * (64 * 2) * SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS * (2 * (MAX_K * 2) + 2) / 4.) * 4);

    topk_softmax_workspace_ = reinterpret_cast<float*>(
        allocator_->reMalloc(topk_softmax_workspace_, sizeof(float) * topk_softmax_workspace_size_, true));
    is_allocate_buffer_ = true;
}
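To make the sizing formula concrete, a worked evaluation for batch_size = 1, using MAX_K = 4 and SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128 as defined above:

// term 1: ceil(1 * 64 * (64 * 2) / 4.) * 4 * 2 = 2048 * 8   =  16384
// term 2: ceil(1 * (64 * 2) * 128 * (2 * (4 * 2) + 2) / 4.) * 4
//       = ceil(128 * 128 * 18 / 4) * 4        = 73728 * 4   = 294912
// total: 16384 + 294912 = 311296 float elements, about 1.19 MiB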
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(size_t           max_batch_size,
                                                size_t           head_num,
                                                size_t           size_per_head,
                                                size_t           beam_width,
                                                size_t           vocab_size,
                                                size_t           vocab_size_padded,
                                                int              end_id,
                                                float            diversity_rate,
                                                float            temperature,
                                                float            len_penalty,
                                                float            repetition_penalty,
                                                cudaStream_t     stream,
                                                cublasMMWrapper* cublas_wrapper,
                                                IAllocator*      allocator,
                                                bool             is_free_buffer_after_forward):
    BaseBeamSearchLayer<T>(max_batch_size,
                           head_num,
                           size_per_head,
                           beam_width,
                           vocab_size,
                           vocab_size_padded,
                           end_id,
                           diversity_rate,
                           temperature,
                           len_penalty,
                           repetition_penalty,
                           stream,
                           cublas_wrapper,
                           allocator,
                           is_free_buffer_after_forward)
{
}

template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer):
    BaseBeamSearchLayer<T>(beam_search_layer)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template<typename T>
OnlineBeamSearchLayer<T>::~OnlineBeamSearchLayer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template class OnlineBeamSearchLayer<float>;
template class OnlineBeamSearchLayer<half>;

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h
deleted
100644 → 0
View file @
83697422
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
namespace turbomind {

template<typename T>
class OnlineBeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
    // meta data
    using BaseBeamSearchLayer<T>::vocab_size_;
    using BaseBeamSearchLayer<T>::vocab_size_padded_;

    using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
    using BaseBeamSearchLayer<T>::topk_softmax_workspace_;

    void allocateBuffer() override;
    void allocateBuffer(size_t batch_size, size_t beam_width) override;

    void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;

    using BaseBeamSearchLayer<T>::stream_;
    using BaseBeamSearchLayer<T>::is_allocate_buffer_;
    using BaseBeamSearchLayer<T>::allocator_;

protected:
public:
    OnlineBeamSearchLayer(size_t           max_batch_size,
                          size_t           head_num,
                          size_t           size_per_head,
                          size_t           beam_width,
                          size_t           vocab_size,
                          size_t           vocab_size_padded,
                          int              end_id,
                          float            diversity_rate,
                          float            temperature,
                          float            len_penalty,
                          float            repetition_penalty,
                          cudaStream_t     stream,
                          cublasMMWrapper* cublas_wrapper,
                          IAllocator*      allocator,
                          bool             is_free_buffer_after_forward);

    OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer);

    ~OnlineBeamSearchLayer();
};

}  // namespace turbomind
src/turbomind/models/llama/CMakeLists.txt
View file @
981a4610
@@ -23,7 +23,6 @@ set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(Llama PUBLIC -lcudart
        cublasMMWrapper
        DynamicDecodeLayer
-       BaseBeamSearchLayer
        activation_kernels
        decoder_masked_multihead_attention
        bert_preprocess_kernels
...
src/turbomind/models/llama/prefix_cache.cu
deleted
100644 → 0
View file @
83697422
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/prefix_cache.h"
// <L,H,D/X,s,X> -> <L,H,D/X,S[:s],X>
template<typename T>
__global__ void insertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, size_t S)
{
    for (int i = threadIdx.x; i < L * H * Dx * s * X; i += blockDim.x) {
        int    i0 = i / X;
        int    x  = i % X;
        int    i1 = i0 / s;
        int    t  = i0 % s;
        size_t j  = (i1 * S + t) * X + x;

        key_cache[j] = src[i];
    }
}

template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st)
{
    insertKeyCache<<<1, 512, 0, st>>>(key_cache, src, L, H, Dx, s, X, S);
}

template void invokeInsertKeyCache(float* key_cache, const float* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template void invokeInsertKeyCache(half* key_cache, const half* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);

// <L,H,s,D> -> <L,H,S[:s],D>
template<typename T>
__global__ void insertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, size_t S)
{
    for (int i = threadIdx.x; i < L * H * s * D; i += blockDim.x) {
        int    i0 = i / D;
        int    d  = i % D;
        int    i1 = i0 / s;
        int    t  = i0 % s;
        size_t j  = (i1 * S + t) * D + d;

        value_cache[j] = src[i];
    }
}

template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st)
{
    insertValueCache<<<1, 512, 0, st>>>(value_cache, src, L, H, s, D, S);
}

template void invokeInsertValueCache(float* value_cache, const float* src, int L, int H, int s, int D, int S, cudaStream_t st);
template void invokeInsertValueCache(half* value_cache, const half* src, int L, int H, int s, int D, int S, cudaStream_t st);
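For orientation, a host-side sketch of driving the key-cache launcher above. Every dimension value below is illustrative, not taken from the deleted code; the header path is the one the file itself includes:

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include "src/turbomind/models/llama/prefix_cache.h"

int main()
{
    // Illustrative dims for the <L,H,D/X,s,X> source layout: layers, heads,
    // head dim, vector width X (8 halves = one 16-byte access), prefix
    // length s, and padded sequence length S.
    int L = 32, H = 32, D = 128, X = 8;
    int s = 17, S = 2048;

    half *key_cache, *src;
    cudaMalloc((void**)&key_cache, sizeof(half) * L * H * (D / X) * S * X);
    cudaMalloc((void**)&src, sizeof(half) * L * H * (D / X) * s * X);

    // Scatter the s-token prefix into the first s time slots of each
    // S-long row of the cache.
    invokeInsertKeyCache(key_cache, src, L, H, D / X, s, X, S, /*stream=*/0);
    cudaDeviceSynchronize();

    cudaFree(src);
    cudaFree(key_cache);
    return 0;
}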