Unverified Commit 981a4610 authored by Li Zhang, committed by GitHub

[Fix] Remove unused code to reduce binary size (#181)

* clean-up

* fix lint

* fix lint
parent 83697422
......@@ -299,21 +299,16 @@ endif()
########################################
add_library(transformer-shared SHARED
$<TARGET_OBJECTS:BaseBeamSearchLayer>
$<TARGET_OBJECTS:BaseSamplingLayer>
$<TARGET_OBJECTS:BeamSearchLayer>
$<TARGET_OBJECTS:DynamicDecodeLayer>
$<TARGET_OBJECTS:llama_fmha>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:OnlineBeamSearchLayer>
$<TARGET_OBJECTS:TopKSamplingLayer>
$<TARGET_OBJECTS:TopPSamplingLayer>
$<TARGET_OBJECTS:TransformerTritonBackend>
$<TARGET_OBJECTS:activation_kernels>
$<TARGET_OBJECTS:ban_bad_words>
$<TARGET_OBJECTS:beam_search_penalty_kernels>
$<TARGET_OBJECTS:beam_search_topk_kernels>
$<TARGET_OBJECTS:bert_preprocess_kernels>
$<TARGET_OBJECTS:cublasAlgoMap>
$<TARGET_OBJECTS:cublasMMWrapper>
......@@ -329,7 +324,6 @@ add_library(transformer-shared SHARED
$<TARGET_OBJECTS:mpi_utils>
$<TARGET_OBJECTS:nccl_utils>
$<TARGET_OBJECTS:nvtx_utils>
$<TARGET_OBJECTS:online_softmax_beamsearch_kernels>
$<TARGET_OBJECTS:sampling_penalty_kernels>
$<TARGET_OBJECTS:sampling_topk_kernels>
$<TARGET_OBJECTS:sampling_topp_kernels>
......
......@@ -26,11 +26,6 @@ add_library(activation_kernels STATIC activation_kernels.cu)
set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gen_relative_pos_bias STATIC gen_relative_pos_bias.cu)
set_property(TARGET gen_relative_pos_bias PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gen_relative_pos_bias PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gen_relative_pos_bias PUBLIC activation_kernels)
add_library(logprob_kernels STATIC logprob_kernels.cu)
set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
......@@ -51,10 +46,6 @@ add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead
set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(online_softmax_beamsearch_kernels STATIC online_softmax_beamsearch_kernels.cu)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_kernels STATIC decoding_kernels.cu)
set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
......@@ -63,15 +54,6 @@ add_library(gpt_kernels STATIC gpt_kernels.cu)
set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(beam_search_penalty_kernels STATIC beam_search_penalty_kernels.cu)
set_property(TARGET beam_search_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(beam_search_penalty_kernels PRIVATE cuda_utils)
add_library(beam_search_topk_kernels STATIC beam_search_topk_kernels.cu)
set_property(TARGET beam_search_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu)
set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
......
......@@ -306,17 +306,17 @@ void invokeGenericActivation(T* out,
const int seq_len, \
cudaStream_t stream);
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, half, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, __nv_bfloat16, __nv_bfloat16);
#endif
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, half, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, __nv_bfloat16, __nv_bfloat16);
#endif
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, float, float);
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, half, half);
// #ifdef ENABLE_BF16
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, __nv_bfloat16, __nv_bfloat16);
// #endif
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, float, float);
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, half, half);
// #ifdef ENABLE_BF16
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, __nv_bfloat16, __nv_bfloat16);
// #endif
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half);
......@@ -324,335 +324,4 @@ INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half);
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, __nv_bfloat16, __nv_bfloat16);
#endif
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, half, half);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, __nv_bfloat16, __nv_bfloat16);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, __nv_bfloat16);
#endif
#undef INSTANTIATE_GENERIC_ACTIVATION
template<typename T>
__global__ void add_bias_tanh(T* out, const T* __restrict bias, int m, int n)
{
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
T val = out[id];
if (bias != nullptr) {
val = val + ldg(&bias[id % n]);
}
out[id] = tanhf(val);
}
}
template<>
__global__ void add_bias_tanh(half* out, const half* __restrict bias, int m, int n)
{
half2* out_ptr = (half2*)out;
const half2* bias_ptr = (half2*)bias;
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
half2 val = out_ptr[id];
if (bias != nullptr) {
val = val + __ldg(&bias_ptr[id % n]);
}
val.x = tanhf(val.x);
val.y = tanhf(val.y);
out_ptr[id] = val;
}
}
#ifdef ENABLE_BF16
template<>
__global__ void add_bias_tanh(__nv_bfloat16* out, const __nv_bfloat16* __restrict bias, int m, int n)
{
__nv_bfloat162* out_ptr = (__nv_bfloat162*)out;
const __nv_bfloat162* bias_ptr = (__nv_bfloat162*)bias;
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
__nv_bfloat162 val = out_ptr[id];
if (bias != nullptr) {
val = bf16hadd2(val, ldg(&bias_ptr[id % n]));
}
val.x = tanhf(val.x);
val.y = tanhf(val.y);
out_ptr[id] = val;
}
}
#endif
template<typename T>
void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream)
{
const int data_type_factor = 4 / sizeof(T); // 1 for fp32, 2 for fp16 and bf16
dim3 block, grid;
if (n / 4 / data_type_factor <= 1024) {
block.x = n / 4 / data_type_factor;
grid.x = m;
}
else {
block.x = 1024;
grid.x = ceil(m * n / 1024.);
}
add_bias_tanh<T><<<grid, block, 0, stream>>>(out, bias, m, n / data_type_factor);
}
template void invokeAddBiasTanh(float* out, const float* bias, const int m, const int n, cudaStream_t stream);
template void invokeAddBiasTanh(half* out, const half* bias, const int m, const int n, cudaStream_t stream);
#ifdef ENABLE_BF16
template void
invokeAddBiasTanh(__nv_bfloat16* out, const __nv_bfloat16* bias, const int m, const int n, cudaStream_t stream);
#endif
template<typename T2, int N>
__global__ void addBiasGeluV2(T2* out,
const T2* __restrict bias,
const int* ia3_tasks,
const T2* ia3_weights,
const int size,
const int* padding_offset,
const int seq_len)
{
const bool with_ia3 = ia3_tasks != nullptr;
for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < size; id += blockDim.x * gridDim.x) {
T2 val = out[id];
if (bias != nullptr) {
T2 reg_bias = ldg(&bias[id % N]);
val = hadd2(val, reg_bias);
}
val = GeluActivation<T2>::apply(val);
if (with_ia3) {
const int word_id = id / N;
const int offset = padding_offset == nullptr ? 0 : padding_offset[word_id];
const int batch_id = (word_id + offset) / seq_len;
const int task = ia3_tasks[batch_id];
val = val * ia3_weights[task * N + (id % N)];
}
out[id] = val;
}
}
template<typename T2, int N, int ELEMENT_PER_ROUND>
__global__ void addBiasGeluV3(T2* out,
const T2* __restrict bias,
const int* ia3_tasks,
const T2* ia3_weights,
const int size,
const int* padding_offset,
const int seq_len)
{
const bool with_ia3 = ia3_tasks != nullptr;
T2 buffer[ELEMENT_PER_ROUND];
T2 tmp_bias[ELEMENT_PER_ROUND];
for (int id = blockIdx.x * blockDim.x * ELEMENT_PER_ROUND + threadIdx.x * ELEMENT_PER_ROUND; id < size;
id += blockDim.x * gridDim.x * ELEMENT_PER_ROUND) {
#pragma unroll
for (int i = 0; i < ELEMENT_PER_ROUND; i++) {
buffer[i] = out[id + i];
if (bias != nullptr) {
tmp_bias[i] = ldg(&bias[(id + i) % N]);
}
}
#pragma unroll
for (int i = 0; i < ELEMENT_PER_ROUND; i++) {
if (bias != nullptr) {
buffer[i] = hadd2(buffer[i], tmp_bias[i]);
}
buffer[i] = GeluActivation<T2>::apply(buffer[i]);
if (with_ia3) {
const int word_id = (id + i) / N;
const int offset = padding_offset == nullptr ? 0 : padding_offset[word_id];
const int batch_id = (word_id + offset) / seq_len;
const int task = ia3_tasks[batch_id];
buffer[i] = buffer[i] * ia3_weights[task * N + ((id + i) % N)];
}
out[id + i] = buffer[i];
}
}
}
#define ADD_BIAS_GELU(HALF_N, ELEMENT_PER_ROUND) \
case HALF_N: \
if (ELEMENT_PER_ROUND > 1) { \
grid.x = grid.x / ELEMENT_PER_ROUND; \
addBiasGeluV3<T2, HALF_N, ELEMENT_PER_ROUND><<<grid, block, 0, stream>>>( \
(T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \
} \
else { \
addBiasGeluV2<T2, HALF_N><<<grid, block, 0, stream>>>( \
(T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \
} \
break;
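// Dispatch note (descriptive, not part of the original source): when T is a 16-bit type and n is
// even, invokeAddBiasGeluV2 below reinterprets the buffers as packed T2 (half2 / __nv_bfloat162)
// elements and the switch picks a kernel specialized for the given half_n; ELEMENT_PER_ROUND > 1
// lets each thread process several T2 values per loop iteration. Any other shape falls back to
// invokeGenericActivation<GeluActivation>.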
template<typename T>
void invokeAddBiasGeluV2(T* out,
const T* bias,
const int* ia3_tasks,
const T* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream)
{
if (n % 2 == 0 && sizeof(T) == 2) {
const int half_n = n / 2;
dim3 block, grid;
block.x = std::min(half_n, 512);
grid.x = (m * half_n + (block.x - 1)) / block.x;
using T2 = typename TypeConverter<T>::Type;
if (grid.x >= 512) {
switch (half_n) {
ADD_BIAS_GELU(256, 1)
ADD_BIAS_GELU(512, 1)
ADD_BIAS_GELU(1024, 1)
ADD_BIAS_GELU(1536, 1)
ADD_BIAS_GELU(2048, 1)
ADD_BIAS_GELU(4096, 2)
ADD_BIAS_GELU(8192, 2)
ADD_BIAS_GELU(16384, 2)
ADD_BIAS_GELU(24576, 2)
ADD_BIAS_GELU(40960, 4)
default:
invokeGenericActivation<GeluActivation>(out,
bias,
(T*)nullptr,
(T*)nullptr,
ia3_tasks,
ia3_weights,
m,
n,
0,
(float*)nullptr,
(float*)nullptr,
padding_offset,
seq_len,
stream);
break;
}
}
else {
switch (half_n) {
ADD_BIAS_GELU(256, 1)
ADD_BIAS_GELU(512, 1)
ADD_BIAS_GELU(1024, 1)
ADD_BIAS_GELU(1536, 1)
ADD_BIAS_GELU(2048, 1)
ADD_BIAS_GELU(4096, 1)
ADD_BIAS_GELU(8192, 2)
ADD_BIAS_GELU(16384, 2)
ADD_BIAS_GELU(24576, 2)
ADD_BIAS_GELU(40960, 2)
default:
invokeGenericActivation<GeluActivation>(out,
bias,
(T*)nullptr,
(T*)nullptr,
ia3_tasks,
ia3_weights,
m,
n,
0,
(float*)nullptr,
(float*)nullptr,
padding_offset,
seq_len,
stream);
break;
}
}
}
else {
invokeGenericActivation<GeluActivation>(out,
bias,
(T*)nullptr,
(T*)nullptr,
ia3_tasks,
ia3_weights,
m,
n,
0,
(float*)nullptr,
(float*)nullptr,
padding_offset,
seq_len,
stream);
}
}
#undef ADD_BIAS_GELU
template void invokeAddBiasGeluV2(float* out,
const float* bias,
const int* ia3_tasks,
const float* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
template void invokeAddBiasGeluV2(half* out,
const half* bias,
const int* ia3_tasks,
const half* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeAddBiasGeluV2(__nv_bfloat16* out,
const __nv_bfloat16* bias,
const int* ia3_tasks,
const __nv_bfloat16* ia3_weights,
const int* padding_offset,
const int seq_len,
const int m,
const int n,
cudaStream_t stream);
#endif // ENABLE_BF16
template<typename T>
__global__ void sigmoid_kernel(T* data, const int size, const float scale)
{
const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (index < size) {
float val = cuda_cast<float>(data[index]);
val = 1.0f / (1.0f + exp(-val)) * scale;
data[index] = T(val);
}
}
template<>
__global__ void sigmoid_kernel(half2* data, const int size, const float scale)
{
const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (index < size / 2) {
half2 val = data[index];
float2 val_float2 = cuda_cast<float2>(val);
val_float2.x = 1.0f / (1.0f + exp(-val_float2.x)) * scale;
val_float2.y = 1.0f / (1.0f + exp(-val_float2.y)) * scale;
data[index] = cuda_cast<half2>(val_float2);
}
}
template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream)
{
if (std::is_same<T, float>::value || (size % 2 != 0)) {
dim3 block(128);
dim3 grid((size + 127) / 128);
sigmoid_kernel<<<grid, block, 0, stream>>>(data, size, scale);
}
else {
dim3 block(128);
dim3 grid((size + 255) / 256);
sigmoid_kernel<<<grid, block, 0, stream>>>((half2*)data, size, scale);
}
}
template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream);
template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
namespace turbomind {
template<typename T>
__global__ void add_bias_temperature(T* logits,
const T* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
int tid = threadIdx.x;
int bid = blockIdx.x;
int bbid = blockIdx.y;
logits += bbid * vocab_size_padded;
const T MASK_VAL = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
const T inv_temp = static_cast<T>(1.0f / (temperature + 1e-6f));
for (int i = tid + bid * blockDim.x; i < vocab_size_padded; i += blockDim.x * gridDim.x) {
if (i < vocab_size) {
T bias_val = bias == nullptr ? (T)(0.0f) : bias[i];
logits[i] = (logits[i] + bias_val) * inv_temp;
}
else {
logits[i] = MASK_VAL;
}
}
}
template<>
__global__ void add_bias_temperature(half2* logits,
const half2* bias,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const float temperature)
{
assert(vocab_size % 2 == 0);
assert(vocab_size_padded % 2 == 0);
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int bbid = blockIdx.y;
const half2 mask_val = __float2half2_rn(-HALF_FLT_MAX);
const half2 inv_temp = __float2half2_rn(1.0f / (temperature + 1e-6f));
const int half_vocab_size = vocab_size / 2;
const int half_vocab_size_padded = vocab_size_padded / 2;
logits += bbid * half_vocab_size_padded;
for (int index = tid + bid * blockDim.x; index < half_vocab_size_padded; index += blockDim.x * gridDim.x) {
int vocab_idx = index % half_vocab_size_padded;
half2 logit = vocab_idx < half_vocab_size ? __ldg(&logits[index]) : mask_val;
if (vocab_idx < half_vocab_size) {
if (bias != nullptr) {
logit = __hadd2(logit, bias[vocab_idx]);
}
logit = __hmul2(logit, inv_temp);
}
logits[index] = logit;
}
}
template<typename T, bool IS_ADDITIVE>
__global__ void apply_repetition_penalty(T* logits,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int step,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int max_input_length,
const float repetition_penalty)
{
assert(step > 0);
const int tid = threadIdx.x;
const int bbid = blockIdx.x;
const int batch_id = bbid / beam_width;
const int bbsize = batch_size * beam_width;
logits += bbid * vocab_size_padded;
extern __shared__ char sbuf[];
T* penalty_logits = reinterpret_cast<T*>(sbuf);
// prevent misalignment when sizeof(T) == 2: round the T region up to a 32-byte boundary before the int indices
int* penalty_indices = reinterpret_cast<int*>(sbuf + (sizeof(T) * step + 31) / 32 * 32);
const int input_length = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length;
if (tid == 0) {
T repet_penalty = static_cast<T>(repetition_penalty);
int prev_id = current_ids[bbid];
T prev_logit = logits[prev_id];
penalty_indices[step - 1] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[step - 1] = prev_logit - repet_penalty;
}
else {
penalty_logits[step - 1] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
if (step > 1) {
int parent_beam = bbid % beam_width;
for (int i = step - 2; i >= 0; --i) {
// Skip the padded tokens.
if (i >= input_length && i < max_input_length) {
continue;
}
parent_beam = parent_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_id = previous_ids[i * bbsize + batch_id * beam_width + parent_beam];
prev_logit = logits[prev_id];
penalty_indices[i] = prev_id;
if (IS_ADDITIVE) {
penalty_logits[i] = prev_logit - repet_penalty;
}
else {
penalty_logits[i] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
}
}
}
}
__syncthreads();
for (int i = tid; i < step; i += blockDim.x) {
if (i >= input_length && i < max_input_length) {
continue;
}
logits[penalty_indices[i]] = penalty_logits[i];
}
}
template<typename T>
__global__ void apply_min_length_penalty(T* logits,
const int min_length,
const int* end_ids,
const int* sequence_lengths,
const int max_input_length,
const int beam_width,
const int vocab_size_padded)
{
int bbid = threadIdx.x + blockIdx.x * blockDim.x; // batch-beam index
int bid = bbid / beam_width; // batch index
// We need +1 because sequence_lengths = max_input_length + num_gen_tokens - 1,
// which is equal to the length of k/v caches.
if (sequence_lengths[bbid] + 1 - max_input_length < min_length) {
T mask_val = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val;
}
}
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream)
{
if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) {
dim3 block(512);
if (std::is_same<T, half>::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) {
dim3 grid((vocab_size_padded / 2 + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(reinterpret_cast<half2*>(logits),
reinterpret_cast<const half2*>(bias),
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
temperature);
}
else {
dim3 grid((vocab_size_padded + block.x - 1) / block.x, beam_width * local_batch_size);
add_bias_temperature<<<grid, block, 0, stream>>>(
logits, bias, batch_size, beam_width, vocab_size, vocab_size_padded, temperature);
}
}
if (repetition_penalty_type != RepetitionPenaltyType::None && step > 0) {
if (repetition_penalty != getDefaultPenaltyValue(repetition_penalty_type)) {
size_t smem_size = (sizeof(T) * step + 31) / 32 * 32 + sizeof(int) * step;
dim3 block(256);
dim3 grid(beam_width * local_batch_size);
if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) {
apply_repetition_penalty<T, false>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
// TODO(jaedeokk):
// Remove (+ite ...) by getting parent_ids with offset
// and then remove 'ite' argument from the function.
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
else if (repetition_penalty_type == RepetitionPenaltyType::Additive) {
apply_repetition_penalty<T, true>
<<<grid, block, smem_size, stream>>>(logits,
batch_size,
beam_width,
vocab_size,
vocab_size_padded,
step,
current_ids,
previous_ids,
parent_ids + ite * beam_width * local_batch_size,
input_lengths,
max_input_length,
repetition_penalty);
}
}
}
if (step - max_input_length < min_length) {
FT_CHECK_WITH_INFO(sequence_lengths != nullptr, "Need sequence_lengths to apply min length penalty");
FT_CHECK_WITH_INFO(end_ids != nullptr, "Need end_ids to apply min length penalty");
const int block_size = min(local_batch_size * beam_width, 1024);
const int grid_size = (local_batch_size * beam_width + block_size - 1) / block_size;
apply_min_length_penalty<<<grid_size, block_size, 0, stream>>>(
logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded);
}
}
template void invokeAddBiasApplyPenalties(int step,
float* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const float* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
template void invokeAddBiasApplyPenalties(int step,
half* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const half* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_fp16.h>
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
template<typename T>
void invokeAddBiasApplyPenalties(int step,
T* logits,
const int* current_ids,
const int* previous_ids,
const int* parent_ids,
const int* input_lengths,
const int* sequence_lengths,
const T* bias,
const int ite,
const int max_input_length,
const int local_batch_size,
const int batch_size,
const int beam_width,
const int vocab_size,
const int vocab_size_padded,
const int* end_ids,
const float temperature,
const float repetition_penalty,
const RepetitionPenaltyType repetition_penalty_type,
const int min_length,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#pragma once
namespace turbomind {
// In the original beam search implementation, a finished beam is marked as finished and the
// search continues only on the remaining beams (i.e. beam_width - 1 beams in the next step).
//
// In this implementation, when a beam finishes, we trace its path and record it in output_ids_tgt,
// along with its normalized score, and the beam search keeps using `beam_width` beams in the
// next step.
//
// After `beam_width` finished beams have been collected, they are sorted by their normed_scores.
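//
// Score sketch (restating the field notes below; illustrative, not verbatim from this header):
//   normed_score = cum_log_prob / powf((float)sequence_length, length_penalty);
// and min_normed_scores tracks, per batch entry, the smallest normed_score among the finished
// hypotheses collected so far.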
struct BeamHypotheses {
int* output_ids_tgt = nullptr;
int* sequence_lengths_tgt = nullptr;
float* cum_log_probs = nullptr; // cum_log
float* normed_scores = nullptr; // cum_log / (length**length_penalty)
float* log_probs = nullptr; // log probs of each generated token
float* min_normed_scores = nullptr; // record the min normed scores for each batch
int* num_beams = nullptr; // the number of finished beams we collect
bool* is_done = nullptr;
// Used to set inputs
const int* output_ids_src;
const int* parent_ids_src;
const int* sequence_lengths_src;
const int* end_ids;
const float* log_probs_src;
// some variables for kernels
int step;
int ite;
int batch_size;
int local_batch_size;
int max_seq_len;
float length_penalty;
bool early_stopping = true;
bool is_return_normed_score = true; // return normed_cum_log_probs or cum_log_probs
};
template<typename T>
void invokeTopkBeamSearch(void* workspace,
size_t& workspace_size,
T* log_probs,
int* ids,
BeamHypotheses* beam_hyps,
const bool* finished,
const int* sequence_lengths,
const int batch_size,
const int beam_width,
const int vocab_size_padded_,
const T diversity_rate,
const float length_penalty,
const int* end_ids,
cudaStream_t stream);
template<typename T>
void invokeTileEncoderResults(T* tiled_encoder_output,
int* tiled_encoder_sequence_length,
const T* encoder_output,
const int* encoder_sequence_length,
const size_t batch_size,
const size_t beam_width,
const size_t mem_max_seq_len,
const size_t d_model,
cudaStream_t stream);
void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const bool* finished,
const float* cum_log_probs,
const int batch_size,
const int beam_width,
cudaStream_t stream);
} // namespace turbomind
......@@ -68,120 +68,6 @@ void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
sync_check_cuda_error();
}
template<typename T>
__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* sequence_lengths, const int max_seq_len)
{
// sequence_lengths: [batch_size]
// attention_mask: [batch_size, 1, max_seq_len, max_seq_len]
attention_mask += blockIdx.x * max_seq_len * max_seq_len;
const int length = sequence_lengths[blockIdx.x];
for (int i = threadIdx.x; i < max_seq_len * max_seq_len; i += blockDim.x) {
// int row_id = i / max_seq_len;
int col_id = i % max_seq_len;
// if (row_id < length && col_id < length) {
// TODO (bhsueh) check whether this modification is OK on other models
if (col_id < length) {
attention_mask[i] = (T)(1.0f);
}
else {
attention_mask[i] = (T)(0.0f);
}
}
}
template<typename T>
void invokeBuildEncoderAttentionMask(
T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream)
{
buildEncoderAttentionMaskKernel<<<batch_size, 256, 0, stream>>>(attention_mask, sequence_lengths, max_seq_len);
}
template void invokeBuildEncoderAttentionMask(float* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
template void invokeBuildEncoderAttentionMask(half* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
#ifdef ENABLE_FP8
template void invokeBuildEncoderAttentionMask(__nv_fp8_e4m3* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
#endif // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeBuildEncoderAttentionMask(__nv_bfloat16* attention_mask,
const int* sequence_lengths,
const int batch_size,
const int max_seq_len,
cudaStream_t stream);
#endif
__global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int* sequence_length, const int batch_size)
{
// used to get the TensorRT fused MHA padding offset
// when the padding is removed
extern __shared__ int tmp_offset[];
if (threadIdx.x == 0) {
tmp_offset[0] = 0;
for (int i = 0; i < batch_size; i++) {
tmp_offset[i + 1] = tmp_offset[i] + sequence_length[i];
}
}
__syncthreads();
for (int i = threadIdx.x; i < batch_size + 1; i += blockDim.x) {
trt_mha_padding_offset[i] = tmp_offset[i];
}
}
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int batch_size,
cudaStream_t stream)
{
getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (batch_size + 1), stream>>>(
trt_mha_padding_offset, sequence_length, batch_size);
}
__global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len)
{
// used to get the TensorRT fused MHA padding offset
// when the padding is kept
extern __shared__ int tmp_offset[];
if (threadIdx.x == 0) {
tmp_offset[0] = 0;
for (int i = 0; i < request_batch_size; i++) {
tmp_offset[i * 2 + 1] = tmp_offset[i * 2] + sequence_length[i];
tmp_offset[i * 2 + 2] = request_seq_len * (i + 1);
}
}
__syncthreads();
for (int i = threadIdx.x; i < 2 * request_batch_size + 1; i += blockDim.x) {
trt_mha_padding_offset[i] = tmp_offset[i];
}
}
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len,
cudaStream_t stream)
{
getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (2 * request_batch_size + 1), stream>>>(
trt_mha_padding_offset, sequence_length, request_batch_size, request_seq_len);
}
template<typename T>
__global__ void rebuild_sequence_length_padding(const T* src, T* dst, const int* padding_offset, const int n)
{
......@@ -287,183 +173,4 @@ template void invokeRemovePadding(__nv_bfloat16* dst,
cudaStream_t stream);
#endif
template<typename T>
__global__ void buildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance)
{
const int head_id = blockIdx.x;
for (int seq_id = threadIdx.x; seq_id < seq_len * seq_len; seq_id += blockDim.x) {
int row_id = seq_id / seq_len;
int col_id = seq_id % seq_len;
int relative_position = col_id - row_id;
int relative_buckets = 0;
int tmp_num_bucket = num_bucket;
if (is_bidirectional) {
tmp_num_bucket /= 2;
if (relative_position > 0) {
relative_buckets += tmp_num_bucket;
}
else {
relative_position *= -1;
}
}
else {
relative_position = abs(relative_position);
}
int max_exact = tmp_num_bucket / 2;
bool is_small = relative_position < max_exact;
int relative_position_if_large =
max_exact
+ (int)(logf(relative_position * 1.0f / max_exact) / logf((float)max_distance / max_exact)
* (tmp_num_bucket - max_exact));
relative_position_if_large = min(relative_position_if_large, tmp_num_bucket - 1);
relative_buckets += is_small ? relative_position : relative_position_if_large;
relative_attention_bias[head_id * seq_len * seq_len + seq_id] =
relative_attention_bias_table[head_id * num_bucket + relative_buckets];
}
}
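// Bucketing sketch (descriptive summary of the kernel above, which follows the T5-style scheme):
// with B = num_bucket (halved first when is_bidirectional, the upper half of the range marking
// positive offsets), relative distances below B / 2 each map to their own bucket, while larger
// distances are binned logarithmically between B / 2 and B - 1, scaled by max_distance.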
template<typename T>
void invokeBuildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream)
{
if (position_embedding_type == PositionEmbeddingType::absolute) {
return;
}
dim3 grid(head_num);
dim3 block(256);
buildRelativeAttentionBias<<<grid, block, 0, stream>>>(relative_attention_bias,
relative_attention_bias_table,
head_num,
seq_len,
num_bucket,
is_bidirectional,
max_distance);
}
template void invokeBuildRelativeAttentionBias(float* relative_attention_bias,
const float* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
template void invokeBuildRelativeAttentionBias(half* relative_attention_bias,
const half* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeBuildRelativeAttentionBias(__nv_bfloat16* relative_attention_bias,
const __nv_bfloat16* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
#endif
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN>
__global__ void getLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
param.output[blockIdx.x * param.d_model + threadIdx.x] = (T_OUT)(
(float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x] * __ldg(param.input_scale));
}
template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
FT_CHECK(param.d_model <= 1024);
getLastTokenDequantize<T_OUT, T_IN><<<param.batch_size, param.d_model, 0, param.stream>>>(param);
}
template void invokeGetLastTokenDequantize<__nv_bfloat16, __nv_fp8_e4m3>(
getLastTokenDequantizeParam<__nv_bfloat16, __nv_fp8_e4m3> param);
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
__global__ void quantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param)
{
for (int i = threadIdx.x; i < param.d_model; i += blockDim.x) {
int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 0 : param.padding_offset[blockIdx.x]);
if (quantize_mode == QUANTIZE_MODE::PER_TENSOR) {
param.dst[padded_row_id * param.d_model + i] =
(T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale));
}
else if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) {
param.dst[padded_row_id * param.d_model + i] =
(T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale + i));
}
}
}
template<>
__global__ void
quantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR> param)
{
int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 0 : __ldg(&param.padding_offset[blockIdx.x]));
__nv_fp8x4_e4m3* src_ptr = ((__nv_fp8x4_e4m3*)param.src) + blockIdx.x * (param.d_model / 4);
half2* dst_ptr = ((half2*)param.dst) + padded_row_id * (param.d_model / 2);
half2 scale = cuda_cast<half2>(__ldg(param.scale));
for (int i = threadIdx.x; i < param.d_model / 4; i += blockDim.x) {
half2 val_0;
half2 val_1;
fp8x4_e4m3_to_half2(&val_0, &val_1, src_ptr + i);
val_0 = hmul2(val_0, scale);
val_1 = hmul2(val_1, scale);
dst_ptr[2 * i + 0] = val_0;
dst_ptr[2 * i + 1] = val_1;
}
}
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param)
{
dim3 grid(param.token_num);
dim3 block(param.d_model);
FT_CHECK(block.x <= 1024);
if (block.x % 4 == 0) {
block.x /= 4;
}
quantizeMatrixRebuildPadding<<<grid, block, 0, param.stream>>>(param);
}
template void invokeQuantizeMatrixRebuildPadding<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR>(
QuantizeMatrixRebuildPaddingParam<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR> param);
#endif
} // namespace turbomind
......@@ -15,7 +15,6 @@
*/
#pragma once
#include "src/turbomind/kernels/gen_relative_pos_bias.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
......@@ -46,21 +45,6 @@ inline void invokeGetPaddingOffset(size_t* h_pinned_token_num,
h_pinned_token_num, h_token_num, tmp_mask_offset, nullptr, sequence_length, batch_size, max_seq_len, stream);
}
template<typename T>
void invokeBuildEncoderAttentionMask(
T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
cudaStream_t stream);
void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset,
const int* sequence_length,
const int request_batch_size,
const int request_seq_len,
cudaStream_t stream);
template<typename T>
void invokeRebuildPadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
......@@ -69,46 +53,4 @@ template<typename T>
void invokeRemovePadding(
T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
template<typename T>
void invokeBuildRelativeAttentionBias(T* relative_attention_bias,
const T* relative_attention_bias_table,
const int head_num,
const int seq_len,
const int num_bucket,
const bool is_bidirectional,
const int max_distance,
const PositionEmbeddingType position_embedding_type,
cudaStream_t stream);
template<typename T_OUT, typename T_IN>
struct getLastTokenDequantizeParam {
T_OUT* const output;
T_IN const* const input;
float const* const input_scale;
const int batch_size;
const int max_seq_len;
const int d_model;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param);
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
struct QuantizeMatrixRebuildPaddingParam {
T_OUT* dst;
const T_IN* src;
const int* padding_offset;
const int token_num;
const int d_model;
const float* scale;
cudaStream_t stream;
};
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif // ENABLE_FP8
} // namespace turbomind
......@@ -16,7 +16,6 @@
#pragma once
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include <cuda_fp16.h>
......
......@@ -22,17 +22,6 @@
namespace turbomind {
template<typename T>
void invokeDecodingInitialize(bool* finished,
int* sequence_length,
int* word_ids,
T* cum_log_probs,
const int* sentence_ids,
const int batch_size,
const int beam_width,
const int max_input_length,
cudaStream_t stream);
// get the token from all_ids at step, then look it up in the embedding table
template<typename T>
......@@ -99,72 +88,7 @@ void invokePaddingEmbeddingKernel(T* padded_embedding_kernel,
const int vocab_size_padded,
cudaStream_t stream);
void invokeGatherTree(int* beams,
int* max_sequence_lengths,
const int max_time,
const int batch_size,
const int beam_width,
const int* step_ids,
const int* parent_ids,
const int* end_tokens,
cudaStream_t stream);
void invokeGatherTree(int* beams,
int* max_sequence_lengths,
const int max_time,
const int batch_size,
const int beam_width,
const int* step_ids,
const int* parent_ids,
const int* end_tokens,
const int max_input_length,
cudaStream_t stream);
struct gatherTreeParam {
int* beams = nullptr;
int* max_sequence_lengths = nullptr;
int max_sequence_length_final_step = 0;
const int* input_lengths = nullptr;
// response input lengths (used to slice the ids during postprocessing)
int* response_input_lengths = nullptr;
int max_time = 0;
int batch_size = 0;
int beam_width = 0;
const int* step_ids = nullptr;
const int* parent_ids = nullptr;
const int* end_tokens = nullptr;
int max_input_length = 0;
const int* prefix_soft_prompt_lengths = nullptr;
// p_prompt_tuning prompt lengths, used to remove prompts during post-processing
const int* p_prompt_tuning_prompt_lengths = nullptr;
int max_input_without_prompt_length = 0;
// prefix soft prompt
int max_prefix_soft_prompt_length = 0;
int* output_ids = nullptr;
cudaStream_t stream;
};
void invokeGatherTree(gatherTreeParam param);
void invokeMinusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream);
void invokePlusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream);
template<typename T>
void invokePlusScalar(T* buf, const T val, const int size, cudaStream_t stream);
void invokeFinalize(int* output_ids,
int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
const int* topk_output_ids,
const int* topk_sequence_lengths,
const float* scores,
const float* topk_cum_log_probs,
const float* topk_log_probs,
const int* num_beams,
const int beam_width,
const int max_seq_len,
const int batch_size,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublas_v2.h"
#include "gen_relative_pos_bias.h"
#include "reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cstdio>
namespace turbomind {
/******************* invokeGenRelativePosBias ***********************/
// relative_position_bias_table is [(2*window_size-1)*(2*window_size-1), headNum]
// relative_position_bias is [head_num, window_size^2, window_size^2]
// grid(window_size*window_size, head_num)
// block(window_size*window_size)
template<typename T, typename Tindex>
__global__ void gen_relative_pos_bias(T* relative_position_bias,
const T* relative_position_bias_table,
const Tindex* relative_position_bias_index,
const int window_size,
const int head_num)
{
const int h_in_window = blockIdx.x / window_size;
const int w_in_window = blockIdx.x % window_size;
const int h_in_token = threadIdx.x / window_size;
const int w_in_token = threadIdx.x % window_size;
const int head_idx = blockIdx.y;
const int elements_per_window = window_size * window_size;
const size_t elements_per_window_2 = elements_per_window * elements_per_window;
const size_t output_idx = head_idx * elements_per_window_2 + blockIdx.x * elements_per_window + threadIdx.x;
if (output_idx < head_num * elements_per_window_2) {
const Tindex idx_in_table =
relative_position_bias_index[(h_in_window * window_size + w_in_window) * elements_per_window
+ h_in_token * window_size + w_in_token];
relative_position_bias[output_idx] = relative_position_bias_table[idx_in_table * head_num + head_idx];
}
}
template<typename T, typename Tindex>
void invokeGenRelativePosBias(T* relative_position_bias,
const T* relative_position_bias_table,
const Tindex* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream)
{
dim3 grid(window_size * window_size, head_num);
dim3 block(window_size * window_size);
if (block.x > 1024) {
printf("[ERROR][invokeGenRelativePosBias] window_size*window_size > 1024.\n");
exit(-1);
}
gen_relative_pos_bias<<<grid, block, 0, stream>>>(
relative_position_bias, relative_position_bias_table, relative_position_bias_index, window_size, head_num);
}
/******************* invokeGenRelativePosBiasV2 ***********************/
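// Descriptive summary (not from the original source): this matches the Swin Transformer V2
// continuous position bias scheme. A two-layer MLP (cpb_mlp_weight1 -> ReLU -> cpb_mlp_weight2)
// maps relative_coords_table to a bias table via the two cuBLAS GEMMs below; the table is then
// gathered per (head, position pair) exactly as in invokeGenRelativePosBias, and finally scaled
// through 16 * sigmoid(x) by invokeSigmoid.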
template<typename T, typename Tindex>
void invokeGenRelativePosBiasV2(T* relative_position_bias,
const T* relative_coords_table,
const Tindex* relative_position_bias_index,
const T* cpb_mlp_weight1,
const T* cpb_mlp_bias1,
const T* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream)
{
dim3 grid(window_size * window_size, head_num);
dim3 block(window_size * window_size);
if (block.x > 1024) {
printf("[ERROR][invokeGenRelativePosBias] window_size*window_size > 1024.\n");
exit(-1);
}
T* relative_position_bias_table;
check_cuda_error(cudaMalloc(&relative_position_bias_table,
((2 * window_size - 1) * (2 * window_size - 1) * head_num) * sizeof(T)));
T* cpb_mlp_1;
check_cuda_error(
cudaMalloc(&cpb_mlp_1, ((2 * window_size - 1) * (2 * window_size - 1) * cpb_mlp_out_dim) * sizeof(T)));
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
int m = (2 * window_size - 1) * (2 * window_size - 1);
T alpha = (T)1.0f;
T beta = (T)0.0f;
cudaDataType_t type = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t compute_type = std::is_same<float, T>::value ? CUBLAS_COMPUTE_32F : CUBLAS_COMPUTE_16F;
#else
cudaDataType_t compute_type = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
#endif
cublasGemmAlgo_t algo = std::is_same<float, T>::value ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP;
check_cuda_error(cublasGemmEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
cpb_mlp_out_dim,
m,
cpb_mlp_in_dim,
&alpha,
cpb_mlp_weight1,
type,
cpb_mlp_in_dim,
relative_coords_table,
type,
cpb_mlp_in_dim,
&beta,
cpb_mlp_1,
type,
cpb_mlp_out_dim,
compute_type,
algo));
invokeGenericActivation<ReluActivation, T, T>(
cpb_mlp_1, cpb_mlp_bias1, nullptr, nullptr, nullptr, nullptr, m, cpb_mlp_out_dim, 0, nullptr, nullptr, stream);
check_cuda_error(cublasGemmEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
head_num,
m,
cpb_mlp_out_dim,
&alpha,
cpb_mlp_weight2,
type,
cpb_mlp_out_dim,
cpb_mlp_1,
type,
cpb_mlp_out_dim,
&beta,
relative_position_bias_table,
type,
head_num,
compute_type,
algo));
gen_relative_pos_bias<<<grid, block, 0, stream>>>(
relative_position_bias, relative_position_bias_table, relative_position_bias_index, window_size, head_num);
invokeSigmoid(
relative_position_bias, window_size * window_size * window_size * window_size * head_num, 16.0f, stream);
check_cuda_error(cudaFree(relative_position_bias_table));
check_cuda_error(cudaFree(cpb_mlp_1));
check_cuda_error(cublasDestroy(cublas_handle));
}
/******************* instantiation ***********************/
template void invokeGenRelativePosBias(float* relative_position_bias,
const float* relative_position_bias_table,
const int* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBias(half* relative_position_bias,
const half* relative_position_bias_table,
const int* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBias(float* relative_position_bias,
const float* relative_position_bias_table,
const int64_t* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBias(half* relative_position_bias,
const half* relative_position_bias_table,
const int64_t* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
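// Rounds x down to the nearest power of two, e.g. pow2_rounddown(12) == 8.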
__host__ __device__ uint32_t pow2_rounddown(uint32_t x)
{
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x >>= 1;
return x + 1;
}
template<typename T>
__global__ void generate_alibi_slopes(T* alibi_slopes, const size_t num_heads)
{
if (threadIdx.x < num_heads) {
// The nearest power of 2 less than or equal to num_heads, following HF's implementation.
int num_heads_pow2 = pow2_rounddown(num_heads);
// Loop over the attention head.
for (int h = threadIdx.x; h < num_heads; h += blockDim.x) {
if (h < num_heads_pow2) {
alibi_slopes[h] = static_cast<T>(powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2) - 3.f)), h + 1));
}
else {
alibi_slopes[h] = static_cast<T>(
powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2 << 1) - 3.f)), (h - num_heads_pow2) * 2 + 1));
}
}
}
}
template<typename T>
void invokeBuildAlibiSlopes(T* alibi_slopes, const size_t num_heads, cudaStream_t stream)
{
// Generate the slopes for ALiBi (attention with linear biases).
//
// Paper: https://arxiv.org/abs/2108.12409
// HF's implementation
// https://github.com/huggingface/transformers/blob/56ef0ba44765162f830873c140bd40bdc975cc34/src/transformers/models/bloom/modeling_bloom.py#L86
// Author's implementation
// https://github.com/ofirpress/attention_with_linear_biases/blob/02aa87e7a29e9340efd28d6d169018eafb3aa57a/fairseq/models/transformer.py#L760
//
// alibi_slopes: [num_heads],
// strictly follows HF's implementation, which treats power-of-2 and non-power-of-2 head counts differently;
// what the paper generates differs from HF's when the number of heads is not a power of 2.
// num_heads: the number of attention heads.
// stream: a cuda stream.
dim3 block(min((int)num_heads, 512));
generate_alibi_slopes<<<1, block, 0, stream>>>(alibi_slopes, num_heads);
}
template void invokeBuildAlibiSlopes(float* alibi_slopes, const size_t num_heads, cudaStream_t stream);
template void invokeBuildAlibiSlopes(half* alibi_slopes, const size_t num_heads, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeBuildAlibiSlopes(__nv_bfloat16* alibi_slopes, const size_t num_heads, cudaStream_t stream);
#endif
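// Closed form of the slopes produced above (descriptive sketch; p = pow2_rounddown(num_heads)):
//   h <  p : alibi_slopes[h] = exp2f(-8.0f * (h + 1) / p)
//   h >= p : alibi_slopes[h] = exp2f(-4.0f * (2 * (h - p) + 1) / p)
// which matches the HF BLOOM construction referenced in the comments above.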
template void invokeGenRelativePosBiasV2(float* relative_position_bias,
const float* relative_coords_table,
const int* relative_position_bias_index,
const float* cpb_mlp_weight1,
const float* cpb_mlp_bias1,
const float* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBiasV2(half* relative_position_bias,
const half* relative_coords_table,
const int* relative_position_bias_index,
const half* cpb_mlp_weight1,
const half* cpb_mlp_bias1,
const half* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBiasV2(float* relative_position_bias,
const float* relative_coords_table,
const int64_t* relative_position_bias_index,
const float* cpb_mlp_weight1,
const float* cpb_mlp_bias1,
const float* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
template void invokeGenRelativePosBiasV2(half* relative_position_bias,
const half* relative_coords_table,
const int64_t* relative_position_bias_index,
const half* cpb_mlp_weight1,
const half* cpb_mlp_bias1,
const half* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <assert.h>
#include <cuda_runtime.h>
#include <stdint.h>
namespace turbomind {
enum class PositionEmbeddingType
{
relative,
absolute,
};
template<typename T, typename Tindex>
void invokeGenRelativePosBias(T* relative_position_bias,
const T* relative_position_bias_table,
const Tindex* relative_position_bias_index,
const int window_size,
const int head_num,
cudaStream_t stream);
template<typename T>
void invokeBuildAlibiSlopes(T* linear_position_bias_slopes, const size_t head_num, cudaStream_t stream);
template<typename T, typename Tindex>
void invokeGenRelativePosBiasV2(T* relative_position_bias,
const T* relative_coords_table,
const Tindex* relative_position_bias_index,
const T* cpb_mlp_weight1,
const T* cpb_mlp_bias1,
const T* cpb_mlp_weight2,
const int window_size,
const int cpb_mlp_in_dim,
const int cpb_mlp_out_dim,
const int head_num,
cudaStream_t stream);
} // namespace turbomind
......@@ -182,29 +182,29 @@ void invokeLogProbFromLogits(float* cum_log_probs,
cum_log_probs, log_probs, input_lengths, max_input_length, batch_size, batch_first);
}
template void invokeLogProbFromLogits(float* cum_log_probs,
const float* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded,
void* workspace,
const size_t workspace_size,
cudaStream_t stream,
const bool batch_first);
template void invokeLogProbFromLogits(float* cum_log_probs,
const half* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded,
void* workspace,
const size_t workspace_size,
cudaStream_t stream,
const bool batch_first);
// template void invokeLogProbFromLogits(float* cum_log_probs,
// const float* logits,
// const int* input_ids,
// const int* input_lengths,
// const size_t max_input_length,
// const size_t batch_size,
// const size_t vocab_size,
// const size_t vocab_size_padded,
// void* workspace,
// const size_t workspace_size,
// cudaStream_t stream,
// const bool batch_first);
// template void invokeLogProbFromLogits(float* cum_log_probs,
// const half* logits,
// const int* input_ids,
// const int* input_lengths,
// const size_t max_input_length,
// const size_t batch_size,
// const size_t vocab_size,
// const size_t vocab_size_padded,
// void* workspace,
// const size_t workspace_size,
// cudaStream_t stream,
// const bool batch_first);
} // end of namespace turbomind
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
namespace turbomind {
template<typename T>
void invokeTopkSoftMax(const T* log_probs,
const T* bias,
const bool* finished,
const int* sequence_lengths,
float* cum_log_probs,
float* output_log_probs,
int* ids,
void* tmp_storage,
const int temp_storage_size,
BeamHypotheses* beam_hyps,
const int batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
const float diversity_rate,
const float length_penalty,
cudaStream_t stream);
} // namespace turbomind
......@@ -14,13 +14,10 @@
cmake_minimum_required(VERSION 3.8)
add_subdirectory(beam_search_layers)
add_subdirectory(sampling_layers)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart
TopKSamplingLayer TopPSamplingLayer
OnlineBeamSearchLayer BeamSearchLayer ban_bad_words stop_criteria
gpt_kernels tensor nvtx_utils)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart TopKSamplingLayer
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
......@@ -17,11 +17,9 @@
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/kernels/ban_bad_words.h"
#include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
......@@ -45,37 +43,6 @@ template<typename T>
void DynamicDecodeLayer<T>::initialize()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
beamsearch_decode_ = new BeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
topk_decode_ = new TopKSamplingLayer<T>(0,
vocab_size_,
......@@ -131,8 +98,6 @@ template<typename T>
DynamicDecodeLayer<T>::~DynamicDecodeLayer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete online_beamsearch_decode_;
delete beamsearch_decode_;
delete topk_decode_;
delete topp_decode_;
freeBuffer();
......@@ -284,105 +249,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
// dynamic decode GPT
if (beam_width > 1) {
// Because batched beam search is not supported yet, we need to compute entries one by one if there
// are different runtime arguments.
const size_t dynamic_decode_batch_size = has_diff_runtime_args_ ? 1 : local_batch_size;
const int dynamic_decode_total_iteration = local_batch_size / dynamic_decode_batch_size;
for (uint dynamic_ite = ite * dynamic_decode_total_iteration;
dynamic_ite < (ite + 1) * dynamic_decode_total_iteration;
++dynamic_ite) {
const int dynamic_id_offset = dynamic_ite * dynamic_decode_batch_size * beam_width;
const int dynamic_decode_vocab_size_units_offset = dynamic_id_offset * vocab_size_padded_;
// common inputs
Tensor logits = input_tensors->at("logits");
Tensor end_id = input_tensors->at("end_id");
TensorMap dynamic_decode_input_tensors(
{{"logits",
Tensor{logits.where,
logits.type,
{dynamic_decode_batch_size, logits.shape[1], logits.shape[2]},
logits.getPtrWithOffset(dynamic_decode_vocab_size_units_offset)}},
{"step", input_tensors->at("step")},
{"max_input_length", input_tensors->at("max_input_length")},
{"end_id",
Tensor{end_id.where,
end_id.type,
{dynamic_decode_batch_size},
end_id.getPtrWithOffset(dynamic_ite * dynamic_decode_batch_size)}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &dynamic_ite}}});
if (input_tensors->isExist("embedding_bias")) {
dynamic_decode_input_tensors.insert({"embedding_bias", input_tensors->at("embedding_bias")});
}
if (input_tensors->isExist("input_lengths")) {
Tensor input_lengths = input_tensors->at("input_lengths");
dynamic_decode_input_tensors.insert(
{"input_lengths",
input_lengths.slice({dynamic_decode_batch_size, input_lengths.shape[1]}, dynamic_id_offset)});
}
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
if (t->first.find("random_seed") == std::string::npos) {
dynamic_decode_input_tensors.insert(*t);
}
}
// common outputs
TensorMap dynamic_decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
dynamic_decode_output_tensors.insert({"sequence_length",
Tensor{sequence_length.where,
sequence_length.type,
{dynamic_decode_batch_size * beam_width},
sequence_length.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("finished")) {
Tensor finished = output_tensors->at("finished");
dynamic_decode_output_tensors.insert({"finished",
Tensor{finished.where,
finished.type,
{dynamic_decode_batch_size * beam_width},
finished.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("cum_log_probs")) {
Tensor cum_log_probs = output_tensors->at("cum_log_probs");
dynamic_decode_output_tensors.insert({"cum_log_probs",
Tensor{cum_log_probs.where,
cum_log_probs.type,
{dynamic_decode_batch_size * beam_width},
cum_log_probs.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("beam_hyps")) {
dynamic_decode_output_tensors.insert("beam_hyps", output_tensors->at("beam_hyps"));
}
if (output_tensors->isExist("output_log_probs")) {
dynamic_decode_output_tensors.insert({"output_log_probs", output_tensors->at("output_log_probs")});
}
dynamic_decode_input_tensors.insert({"src_cache_indirection", input_tensors->at("src_cache_indirection")});
dynamic_decode_output_tensors.insert({"parent_ids", output_tensors->at("parent_ids")});
dynamic_decode_output_tensors.insert(
{"tgt_cache_indirection", output_tensors->at("tgt_cache_indirection")});
FT_CHECK_WITH_INFO(dynamic_decode_output_tensors.isExist("cum_log_probs"),
"cum_log_probs should be provided in beam search.");
if (true || beam_width < 16
|| (output_tensors->isExist("beam_hyps")
&& input_tensors->getVal<float>("beam_search_diversity_rate", 0.0f) != 0.0f)) {
// only online_beamsearch_decode_ support beam_search_diversity_rate when beam_hyps is used
online_beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
else {
FT_CHECK(false); // deprecate this module
beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
} // end of dynamic_ite
FT_CHECK_WITH_INFO(0, "Beam-search is not supported.");
}
else { // beam_width=1
// Sampling supports batching, so we always compute all sentences at once.
......
......@@ -19,7 +19,6 @@
#include <string>
#include <unordered_map>
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
......@@ -34,8 +33,6 @@ protected:
void initialize();
bool hasDiffRuntimeArgs(TensorMap* input_tensors);
DynamicDecodeBaseLayer* online_beamsearch_decode_;
DynamicDecodeBaseLayer* beamsearch_decode_;
DynamicDecodeBaseLayer* topk_decode_;
DynamicDecodeBaseLayer* topp_decode_;
......