Unverified Commit 7f943a26 authored by Li Zhang, committed by GitHub

Unify prefill & decode passes (#775)

* Unify prefill and decode passes

* dynamic split-fuse

* refactor

* correct input count calculation

* remove unused

* lint

* lint

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build
parent 2ba90822
......@@ -48,7 +48,9 @@ __global__ void getPaddingOffsetAndCuSeqLensKernel(size_t* h_valid_word_num,
if (calculate_cu_seqlens) {
cu_seqlens[batch_size] = total_seq_len;
}
h_valid_word_num[0] = (size_t)total_seq_len;
if (h_valid_word_num) {
h_valid_word_num[0] = (size_t)total_seq_len;
}
}
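For reference, the cumulative-length bookkeeping above is an exclusive prefix sum over the per-sequence lengths plus the total token count. A minimal host-side sketch of the same result (not the device kernel; names are illustrative) — for lengths {3, 5, 2} it yields cu_seqlens {0, 3, 8, 10} and a total of 10:

#include <vector>

std::vector<int> build_cu_seqlens(const std::vector<int>& lengths, size_t* total_out)
{
    std::vector<int> cu_seqlens(lengths.size() + 1, 0);
    int total = 0;
    for (size_t i = 0; i < lengths.size(); ++i) {
        cu_seqlens[i] = total;  // number of tokens before sequence i
        total += lengths[i];
    }
    cu_seqlens[lengths.size()] = total;  // mirrors cu_seqlens[batch_size] = total_seq_len above
    if (total_out) {
        *total_out = (size_t)total;  // mirrors the guarded h_valid_word_num[0] store above
    }
    return cu_seqlens;
}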
void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
......@@ -60,15 +62,19 @@ void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
const int max_seq_len,
cudaStream_t stream)
{
h_pinned_token_num[0] = 0;
if (h_pinned_token_num) {
h_pinned_token_num[0] = 0;
}
getPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>(
h_pinned_token_num, tmp_mask_offset, cu_seqlens, sequence_lengths, batch_size, max_seq_len);
if (h_pinned_token_num) {
#ifdef _MSC_VER
cudaStreamSynchronize(stream);
cudaStreamSynchronize(stream);
#else
while (((volatile size_t*)h_pinned_token_num)[0] == 0) {};
while (((volatile size_t*)h_pinned_token_num)[0] == 0) {};
#endif
h_token_num[0] = h_pinned_token_num[0];
h_token_num[0] = h_pinned_token_num[0];
}
sync_check_cuda_error();
}
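The pinned-buffer handshake above (zero the slot, launch, then spin on the host until the kernel stores a non-zero count, with a plain stream sync as the MSVC fall-back) can be reproduced stand-alone. A minimal sketch, with a hypothetical sum_lengths_kernel standing in for the real kernel:

#include <cuda_runtime.h>

__global__ void sum_lengths_kernel(size_t* h_total, const int* lengths, int n)
{
    size_t total = 0;
    for (int i = 0; i < n; ++i) {
        total += lengths[i];
    }
    h_total[0] = total;  // stored directly into host-pinned memory
}

size_t get_total_length(const int* d_lengths, int n, cudaStream_t stream)
{
    static size_t* h_total = nullptr;
    if (!h_total) {
        cudaMallocHost(&h_total, sizeof(size_t));  // pinned, device-accessible
    }
    h_total[0] = 0;  // sentinel; the kernel result is assumed to be non-zero
    sum_lengths_kernel<<<1, 1, 0, stream>>>(h_total, d_lengths, n);
#ifdef _MSC_VER
    cudaStreamSynchronize(stream);  // the MSVC fall-back path taken above
#else
    while (((volatile size_t*)h_total)[0] == 0) {}  // spin until the store lands
#endif
    return h_total[0];
}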
......
......@@ -20,13 +20,11 @@ struct DecoderMultiHeadAttentionParams {
T* __restrict__ v_bias;
// sequence-level buffers
const int* __restrict__ per_sample_length;
const int* __restrict__ context_length;
const bool* __restrict__ finished;
const float* __restrict__ rope_theta;
// kv cache
void** __restrict__ per_sample_k_cache; // [H, S, D]
void** __restrict__ per_sample_v_cache; // [H, S, D]
size_t layer_offset;
/// cache layout M,[N,H,x,D]
......
......@@ -145,7 +145,7 @@ struct DecoderMultiHeadAttentionKernel {
kv_head_idx_ = head_idx_ / gqa_group_size;
is_gqa_leader_ = head_idx_ % gqa_group_size == 0;
timestep_ = params_.per_sample_length[batch_idx_];
timestep_ = params_.context_length[batch_idx_] - 1;
if (kSplitK && params.max_split_k > 1) {
const int slice_count = (timestep_ + kSliceLen - 1) / kSliceLen;
......@@ -815,7 +815,7 @@ struct DecoderMultiHeadAttentionKernel {
{
const int batch_idx = get_batch_idx();
const int head_idx = get_head_idx();
const int timestep = params.per_sample_length[batch_idx];
const int timestep = params.context_length[batch_idx] - 1;
const int max_split_k = params.max_split_k;
const int slice_count = get_slice_count(timestep);
const int slice_per_split = (slice_count + max_split_k - 1) / max_split_k;
......
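To make the ceiling divisions in the hunk above concrete, a tiny sketch with illustrative numbers (the real kSliceLen is defined elsewhere in the kernel):

#include <cstdio>

int main()
{
    const int kSliceLen   = 128;   // illustrative slice length
    const int timestep    = 1000;  // context_length - 1 for one sequence
    const int max_split_k = 4;

    const int slice_count     = (timestep + kSliceLen - 1) / kSliceLen;         // ceil(1000 / 128) = 8
    const int slice_per_split = (slice_count + max_split_k - 1) / max_split_k;  // ceil(8 / 4)      = 2

    std::printf("%d slices, %d slice(s) per split\n", slice_count, slice_per_split);
    return 0;
}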
......@@ -53,7 +53,7 @@ void TestBlocks(thrust::universal_vector<half>& linear, // linear data
std::mt19937 g(rd());
std::shuffle(idxs.begin(), idxs.end(), g);
for (int i = 0; i < idxs.size(); ++i) {
for (size_t i = 0; i < idxs.size(); ++i) {
ptrs[i] = blocks.data().get() + idxs[i] * head_num * block_size * head_dim;
}
......@@ -115,8 +115,8 @@ int main(int argc, char* argv[])
constexpr int KvHeadNum = 32;
constexpr int kBatchSize = 1;
// constexpr int kContextLen = 7306;
constexpr int kContextLen = 1024;
constexpr int kSequenceLen = kContextLen + 1;
constexpr int kSequenceLen = 1024;
constexpr int kContextLen = kSequenceLen + 1;
constexpr int kBlockSz = 128;
constexpr int kTestIter = 10;
constexpr int kMaxSplitK = 1;
......@@ -126,9 +126,10 @@ int main(int argc, char* argv[])
thrust::universal_vector<half> output(kBatchSize * kHeadNum * kHeadDim);
thrust::universal_vector<half> qkv(kBatchSize * (kHeadNum + KvHeadNum * 2) * kHeadDim);
thrust::universal_vector<bool> finished(kBatchSize);
thrust::universal_vector<half> k_cache(kBatchSize * kSequenceLen * KvHeadNum * kHeadDim);
thrust::universal_vector<half> v_cache(kBatchSize * kSequenceLen * KvHeadNum * kHeadDim);
thrust::universal_vector<int> sequence_lengths(kBatchSize);
thrust::universal_vector<half> k_cache(kBatchSize * kContextLen * KvHeadNum * kHeadDim);
thrust::universal_vector<half> v_cache(kBatchSize * kContextLen * KvHeadNum * kHeadDim);
thrust::universal_vector<int> context_length(kBatchSize);
thrust::universal_vector<int> sequence_length(kBatchSize);
thrust::universal_vector<void*> k_cache_ptrs(kBatchSize);
thrust::universal_vector<void*> v_cache_ptrs(kBatchSize);
......@@ -138,23 +139,23 @@ int main(int argc, char* argv[])
rng.GenerateNormal(qkv.data().get(), qkv.size(), 1.f, 0.f);
if (kContextLen) {
rng.GenerateNormal(k_cache.data().get(), kBatchSize * KvHeadNum * kSequenceLen * kHeadDim);
rng.GenerateNormal(v_cache.data().get(), kBatchSize * KvHeadNum * kSequenceLen * kHeadDim);
if (kSequenceLen) {
rng.GenerateNormal(k_cache.data().get(), kBatchSize * KvHeadNum * kContextLen * kHeadDim);
rng.GenerateNormal(v_cache.data().get(), kBatchSize * KvHeadNum * kContextLen * kHeadDim);
cudaMemset2DAsync(k_cache.data().get() + kContextLen * kHeadDim,
sizeof(half) * kSequenceLen * kHeadDim,
cudaMemset2DAsync(k_cache.data().get() + kSequenceLen * kHeadDim,
sizeof(half) * kContextLen * kHeadDim,
0,
sizeof(half) * kHeadDim,
kBatchSize * KvHeadNum);
if constexpr (0) {
for (int b = 0; b < kBatchSize; ++b) {
for (int h = 0; h < KvHeadNum; ++h) {
for (int s = 0; s < kSequenceLen; ++s) {
for (int s = 0; s < kContextLen; ++s) {
for (int d = 0; d < kHeadDim; ++d) {
std::cout << std::setw(7) << std::setprecision(4) << std::fixed
<< (float)k_cache[b * KvHeadNum * kSequenceLen * kHeadDim
+ h * kSequenceLen * kHeadDim + s * kHeadDim + d]
<< (float)k_cache[b * KvHeadNum * kContextLen * kHeadDim
+ h * kContextLen * kHeadDim + s * kHeadDim + d]
<< " ";
}
std::cout << "\n";
......@@ -166,8 +167,8 @@ int main(int argc, char* argv[])
std::exit(0);
}
cudaMemset2DAsync(v_cache.data().get() + kContextLen * kHeadDim,
sizeof(half) * kSequenceLen * kHeadDim,
cudaMemset2DAsync(v_cache.data().get() + kSequenceLen * kHeadDim,
sizeof(half) * kContextLen * kHeadDim,
0,
sizeof(half) * kHeadDim,
kBatchSize * KvHeadNum);
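The pitched memsets above clear exactly one timestep slot (the yet-to-be-written position kSequenceLen) in every (batch, head) row of the linear reference cache. A minimal sketch of the same trick with generic sizes (names are illustrative):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// View the cache as (batch * heads) rows of seq_len * head_dim halves and
// zero one head_dim-wide column (a single timestep) in every row.
void clear_timestep_slot(
    half* cache, int batch_x_heads, int seq_len, int head_dim, int timestep, cudaStream_t stream)
{
    cudaMemset2DAsync(cache + (size_t)timestep * head_dim,  // column start in row 0
                      sizeof(half) * seq_len * head_dim,    // pitch: one (batch, head) row
                      0,                                    // byte value to write
                      sizeof(half) * head_dim,              // width: one timestep slot
                      batch_x_heads,                        // height: number of rows
                      stream);
}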
......@@ -193,7 +194,8 @@ int main(int argc, char* argv[])
cudaDeviceSynchronize();
for (int i = 0; i < kBatchSize; ++i) {
sequence_lengths[i] = kContextLen;
sequence_length[i] = kSequenceLen;
context_length[i] = kContextLen;
k_cache_ptrs[i] = k_cache.data().get() + i * k_cache.size() / kBatchSize;
v_cache_ptrs[i] = v_cache.data().get() + i * v_cache.size() / kBatchSize;
k_cache_ref_ptrs[i] = k_cache_ref.data().get() + i * k_cache_ref.size() / kBatchSize;
......@@ -212,7 +214,7 @@ int main(int argc, char* argv[])
params.stride = (kHeadNum + 2 * KvHeadNum) * kHeadDim;
params.batch_size = kBatchSize;
params.max_seq_len = kContextLen + 1;
params.max_seq_len = kSequenceLen;
params.cu_block_cnts = cu_block_cnts.data().get();
printf("%d %d\n", (int)k_ptrs.size(), (int)v_ptrs.size());
......@@ -220,11 +222,9 @@ int main(int argc, char* argv[])
params.v_cache_block_ptrs = (void**)v_ptrs.data().get();
params.kv_cache_block_size = kBlockSz;
params.finished = finished.data().get();
params.per_sample_length = sequence_lengths.data().get();
params.per_sample_k_cache = k_cache_ref_ptrs.data().get();
params.per_sample_v_cache = v_cache_ref_ptrs.data().get();
params.layer_offset = 0;
params.finished = finished.data().get();
params.context_length = context_length.data().get();
params.layer_offset = 0;
params.num_heads = kHeadNum;
params.num_kv_heads = KvHeadNum;
......@@ -238,8 +238,16 @@ int main(int argc, char* argv[])
params.partial_M = partial_M.data().get();
params.partial_O = partial_O.data().get();
params.max_split_k = kMaxSplitK;
params.arch = 80;
for (int i = 0; i < kTestIter; ++i) {
mmha_ft_reference(params, cudaStream_t{});
mmha_ft_reference(params,
(half**)k_cache_ref_ptrs.data().get(),
(half**)v_cache_ref_ptrs.data().get(),
sequence_length.data().get(),
kContextLen,
cudaStream_t{});
}
cudaDeviceSynchronize();
......@@ -249,14 +257,7 @@ int main(int argc, char* argv[])
}
std::cout << "---------------------------------------------------\n";
params.out = output.data().get();
params.per_sample_k_cache = k_cache_ptrs.data().get();
params.per_sample_v_cache = v_cache_ptrs.data().get();
params.max_split_k = kMaxSplitK;
params.max_seq_len = kContextLen;
params.arch = 80;
params.out = output.data().get();
std::vector<thrust::universal_vector<half>> outputs;
......@@ -271,19 +272,14 @@ int main(int argc, char* argv[])
}
}
thrust::universal_vector<int> seq_lens(kBatchSize);
for (auto& x : seq_lens) {
x = kContextLen + 1;
}
if (1) {
ConvertBlocksToLinear((const half**)k_ptrs.data().get(),
k_cache.data().get(),
cu_block_cnts.data().get(),
seq_lens.data().get(),
context_length.data().get(),
0,
kBlockSz,
kSequenceLen,
kContextLen,
KvHeadNum,
kHeadDim,
kBatchSize,
......@@ -291,10 +287,10 @@ int main(int argc, char* argv[])
ConvertBlocksToLinear((const half**)v_ptrs.data().get(),
v_cache.data().get(),
cu_block_cnts.data().get(),
seq_lens.data().get(),
context_length.data().get(),
0,
kBlockSz,
kSequenceLen,
kContextLen,
KvHeadNum,
kHeadDim,
kBatchSize,
......@@ -316,15 +312,15 @@ int main(int argc, char* argv[])
// [H, S, D]
Compare(k_cache.data().get() + kContextLen * kHeadDim,
k_cache_ref.data().get() + kContextLen * kHeadDim,
kSequenceLen * kHeadDim,
Compare(k_cache.data().get() + kSequenceLen * kHeadDim,
k_cache_ref.data().get() + kSequenceLen * kHeadDim,
kContextLen * kHeadDim,
kHeadDim,
KvHeadNum);
Compare(v_cache.data().get() + kContextLen * kHeadDim,
v_cache_ref.data().get() + kContextLen * kHeadDim,
kSequenceLen * kHeadDim,
Compare(v_cache.data().get() + kSequenceLen * kHeadDim,
v_cache_ref.data().get() + kSequenceLen * kHeadDim,
kContextLen * kHeadDim,
kHeadDim,
KvHeadNum);
......
......@@ -182,7 +182,12 @@ struct SATypeConverter<half> {
};
template<typename T>
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p, cudaStream_t st)
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p,
T** per_sample_k_cache,
T** per_sample_v_cache,
const int* sequence_length,
int max_memory_len,
cudaStream_t st)
{
using DataType = typename SATypeConverter<T>::Type;
......@@ -204,18 +209,18 @@ void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p, cudaStream_t
params.stride = p.stride;
params.finished = (bool*)p.finished;
params.k_cache_per_sample = reinterpret_cast<DataType**>(p.per_sample_k_cache);
params.v_cache_per_sample = reinterpret_cast<DataType**>(p.per_sample_v_cache);
params.k_cache_per_sample = reinterpret_cast<DataType**>(per_sample_k_cache);
params.v_cache_per_sample = reinterpret_cast<DataType**>(per_sample_v_cache);
params.kv_cache_per_sample_offset = p.layer_offset;
params.batch_size = p.batch_size;
params.beam_width = 1;
params.memory_max_len = p.max_seq_len;
params.memory_max_len = max_memory_len;
params.prefix_prompt_lengths = 0;
params.max_prefix_prompt_length = 0;
params.length_per_sample = p.per_sample_length; // max_input_length + current output length
params.length_per_sample = sequence_length; // max_input_length + current output length
for (int i = 0; i < p.batch_size; ++i) {
params.timestep = std::max(p.per_sample_length[i], params.timestep);
params.timestep = std::max(sequence_length[i], params.timestep);
}
std::cout << "timestep = " << params.timestep << "\n";
......@@ -237,6 +242,11 @@ void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p, cudaStream_t
masked_multihead_attention(params, st);
}
template void mmha_ft_reference(const DecoderMultiHeadAttentionParams<half>& params, cudaStream_t st);
template void mmha_ft_reference(const DecoderMultiHeadAttentionParams<half>& params,
half** per_sample_k_cache,
half** per_sample_v_cache,
const int* sequence_length,
int max_memory_len,
cudaStream_t st);
} // namespace turbomind
......@@ -33,6 +33,11 @@ private:
};
template<typename T>
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& params, cudaStream_t st);
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& params,
T** per_sample_k_cache,
T** per_sample_v_cache,
const int* sequence_length,
int max_memory_len,
cudaStream_t st);
} // namespace turbomind
......@@ -34,10 +34,11 @@ public:
class Barrier {
public:
Barrier(unsigned count)
Barrier(unsigned count): count_(count)
{
TM_LOG_INFO("Barrier(%d)", (int)count);
pthread_barrier_init(&barrier_, nullptr, count);
if (count_ > 1) {
pthread_barrier_init(&barrier_, nullptr, count);
}
}
Barrier(const Barrier&) = delete;
......@@ -47,15 +48,20 @@ public:
void wait()
{
pthread_barrier_wait(&barrier_);
if (count_ > 1) {
pthread_barrier_wait(&barrier_);
}
}
~Barrier()
{
pthread_barrier_destroy(&barrier_);
if (count_ > 1) {
pthread_barrier_destroy(&barrier_);
}
}
private:
const int count_;
pthread_barrier_t barrier_{};
};
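With the count_ > 1 guard, a single-participant barrier never touches pthread_barrier_* at all. A minimal usage sketch (not from the repo; it assumes the Barrier class above is in scope):

#include <cstdio>
#include <thread>
#include <vector>

void run_workers(int n)
{
    Barrier barrier(n);  // n == 1 degenerates to a no-op barrier
    std::vector<std::thread> workers;
    for (int rank = 0; rank < n; ++rank) {
        workers.emplace_back([&barrier, rank] {
            std::printf("rank %d: step done\n", rank);
            barrier.wait();  // every rank must arrive before any proceeds
            std::printf("rank %d: past barrier\n", rank);
        });
    }
    for (auto& t : workers) {
        t.join();
    }
}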
......
......@@ -9,16 +9,13 @@ find_package(CUDAToolkit REQUIRED)
add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
LlamaCacheManager.cc
BlockManager.cc
SequenceManager.cc
LlamaContextDecoder.cc
LlamaContextAttentionLayer.cc
LlamaDecoderSelfAttentionLayer.cc
LlamaDecoder.cc
LlamaWeight.cc
LlamaDecoderLayerWeight.cc
LlamaFfnLayer.cc
unified_decoder.cc
unified_attention_layer.cc
llama_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu)
......
......@@ -9,6 +9,7 @@
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -28,7 +29,6 @@ struct BatchState {
float* h_rope_theta;
std::vector<int> seq_len_limit;
std::vector<int> is_swap_in;
std::vector<const Sequence*> sequences;
std::vector<std::shared_ptr<Request>> requests;
......@@ -42,6 +42,26 @@ struct BatchState {
template<typename T>
class LlamaV2;
struct GenerationState {
int max_init_ctx_len;
int step;
int sum_seq_len;
int max_seq_len;
int partial;
int partial_context_legnth;
std::vector<uint64_t> unique_ids;
int max_input_count1;
int max_input_count2;
std::deque<int> min_input_count;
int finished_count;
};
template<typename T>
class LlamaBatch {
public:
......@@ -58,35 +78,24 @@ public:
void ProcessInferRequests(const Requests& requests);
[[nodiscard]] bool Initialize();
void ContextDecode();
void AdjustMaxInputCount(GenerationState& g,
const std::vector<const Sequence*>& sequences,
const std::vector<int>& context_length);
struct GenerationState {
int max_init_ctx_len;
int step;
int sum_seq_len;
int max_seq_len;
};
void Initialize(GenerationState& g);
void InitializeSampling();
void InitializeSampling(const GenerationState& g);
GenerationState InitializeGeneration();
[[nodiscard]] bool Forward(GenerationState& g, int iter);
[[nodiscard]] bool Generate(GenerationState& g);
[[nodiscard]] auto Finish(GenerationState& g, int& finished_count) -> std::vector<Signal>;
[[nodiscard]] auto Finish(GenerationState& g) -> std::vector<Signal>;
[[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false);
void
OutputContextLogits(T* context_decoder_output, const std::vector<int>& indices, const std::vector<int>& lengths);
explicit LlamaBatch(int max_batch_size,
int max_context_token_num,
int session_len,
std::unique_ptr<SequenceManager> sequence_manager,
LlamaV2<T>* llama);
explicit LlamaBatch(const EngineParams& params, int cache_block_seq_len, int quant_policy, LlamaV2<T>* model);
~LlamaBatch()
{
......@@ -177,7 +186,7 @@ private:
private:
const int max_batch_size_;
const int max_context_token_num_;
const int session_len_;
int session_len_;
const int rank_;
const bool debug_;
const int step_length_;
......@@ -201,6 +210,7 @@ private:
// lengths
int* input_length_buf_{}; // input + cache missed length
int* context_length_buf_{}; // history length + input_length
int* init_context_length_{};
// temp buffers used for block->linear kv-cache conversion
T* tmp_k_cache_buf_{};
T* tmp_v_cache_buf_{};
......@@ -228,13 +238,6 @@ private:
int* h_end_ids_buf_{};
int* d_end_ids_buf_{};
int** request_output_ids_ptrs_{};
int* request_output_ids_lens_{};
int** request_seqlen_ptrs_{};
int** h_request_output_ids_ptrs_{};
int* h_request_output_ids_lens_{};
int** h_request_seqlen_ptrs_{};
// pinned buffers
int* h_input_ids_buf_{};
int* h_input_length_buf_{};
......@@ -293,6 +296,10 @@ private:
bool output_stop_token_{false};
int* h_output_ids_{};
const int num_tokens_per_iter_;
const int extra_tokens_per_iter_;
const int max_prefill_iters_;
};
} // namespace turbomind
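The new num_tokens_per_iter / extra_tokens_per_iter / max_prefill_iters knobs bound how much prefill work a single forward pass may carry under dynamic split-fuse; the actual scheduling is presumably implemented in LlamaBatch.cc. The sketch below only illustrates one plausible way such a per-iteration budget could be derived and is an assumption, not the repo's policy:

#include <algorithm>

// Hypothetical policy, for illustration only: num_tokens_per_iter caps how many
// prompt tokens one pass prefills, while max_prefill_iters caps how many passes
// a single prompt may take, so the effective chunk grows for very long prompts.
int prefill_chunk_size(int prompt_len, int num_tokens_per_iter, int max_prefill_iters)
{
    // smallest chunk that still finishes the prompt within max_prefill_iters passes
    const int min_chunk = (prompt_len + max_prefill_iters - 1) / max_prefill_iters;
    return std::max(num_tokens_per_iter, min_chunk);
}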
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/LlamaCacheManager.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace turbomind {
LlamaCacheManager::~LlamaCacheManager()
{
for (auto& p : device_mem_) {
allocator_->free(&p, false);
}
}
void* LlamaCacheManager::allocate(bool is_preallocte)
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate]");
}
void* mem_ptr{};
if (!device_free_.empty()) {
mem_ptr = device_free_.front();
device_free_.pop();
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else if (entry_count_ < max_entry_count_) {
const auto alloc_count = std::min(chunk_size_, max_entry_count_ - entry_count_);
const size_t entry_byte_size = 2 * cache_byte_size_; // 2 for k,v
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
}
const auto chunk_ptr = allocator_->malloc(alloc_count * entry_byte_size, false);
FT_CHECK(chunk_ptr);
device_mem_.push_back(chunk_ptr);
entry_count_ += alloc_count;
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
}
for (int i = 0; i < alloc_count; ++i) {
device_free_.push((uint8_t*)chunk_ptr + entry_byte_size * i);
}
if (!is_preallocte) {
mem_ptr = device_free_.front();
device_free_.pop();
}
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else {
mem_ptr = evict();
FT_CHECK_WITH_INFO(mem_ptr, "No enough cache entries.");
}
return mem_ptr;
}
auto LlamaCacheManager::create(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
}
for (const auto& e : device_cache_) {
if (e.id == id) {
if (rank_ == 0) {
TM_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
}
erase(id);
}
}
const auto mem_ptr = (uint8_t*)allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
device_cache_.push_back({
id,
max_seq_len_,
{},
0,
mem_ptr,
mem_ptr + cache_byte_size_,
{},
static_cast<uint64_t>(-1),
});
return device_cache_.back();
}
auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::iterator
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
if (it == device_cache_.end()) {
TM_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
FT_CHECK(0);
}
return it;
}
auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache == nullptr) {
FT_CHECK(entry->cache_len == 0);
const auto mem_ptr = allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
entry->k_cache = mem_ptr;
entry->v_cache = (uint8_t*)entry->k_cache + cache_byte_size_;
}
entry->timestamp = static_cast<uint64_t>(-1);
return *entry;
}
void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
}
auto entry = getEntryOrThrow(seq.id);
entry->timestamp = ++timestamp_;
entry->token_ids = seq.token_ids;
entry->cache_len = seq.cache_len;
FT_CHECK(seq.k_cache == entry->k_cache && seq.v_cache == entry->v_cache);
}
void LlamaCacheManager::erase(uint64_t id)
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache) {
device_free_.push(entry->k_cache);
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
}
}
device_cache_.erase(entry);
}
void* LlamaCacheManager::evict()
{
FT_CHECK(!device_cache_.empty());
auto it = std::min_element(device_cache_.begin(), device_cache_.end(), [](const auto& a, const auto& b) {
return a.timestamp < b.timestamp;
});
if (it->timestamp == static_cast<uint64_t>(-1)) {
return nullptr;
}
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
}
FT_CHECK(it->k_cache);
auto mem_ptr = it->k_cache;
it->k_cache = it->v_cache = nullptr;
it->cache_len = 0;
it->timestamp = static_cast<uint64_t>(-1);
return mem_ptr;
}
bool LlamaCacheManager::contains(uint64_t id) const noexcept
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
return it != device_cache_.end();
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/logger.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <queue>
#include <unordered_map>
#include <vector>
namespace turbomind {
// k-cache layout [L, H, D/x, S[s:], x]
// v-cache layout [L, H, S[s:], D/x, x]
class LlamaCacheManager {
public:
LlamaCacheManager(size_t layer_num,
size_t head_num,
size_t size_per_head,
size_t max_seq_len,
size_t elem_bits,
size_t max_entry_count,
size_t chunk_size,
int rank,
IAllocator* allocator):
layer_num_(layer_num),
head_num_(head_num),
size_per_head_(size_per_head),
max_seq_len_(max_seq_len),
elem_bits_(elem_bits),
cache_byte_size_(layer_num_ * head_num_ * max_seq_len_ * size_per_head_ * elem_bits_ / 8),
max_entry_count_(max_entry_count),
chunk_size_(chunk_size),
rank_(rank),
allocator_(allocator)
{
if (rank == 0) {
TM_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
TM_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
}
allocate(true);
}
~LlamaCacheManager();
struct Sequence {
// header
uint64_t id;
size_t max_seq_len;
// payloads
std::vector<int> token_ids; // all token ids
size_t cache_len; // cache_len == 0 -> cache miss
void* k_cache;
void* v_cache;
std::vector<uint8_t> random_state_; // states for RNGs
// for LRU policy
uint64_t timestamp;
};
Sequence create(uint64_t id, cudaStream_t stream);
Sequence fetch(uint64_t id, cudaStream_t stream);
void update(const Sequence& seq, cudaStream_t stream);
void erase(uint64_t id);
bool contains(uint64_t id) const noexcept;
private:
std::vector<Sequence>::iterator getEntryOrThrow(uint64_t id);
void* allocate(bool is_preallocte);
void* evict();
private:
const size_t layer_num_{};
const size_t head_num_{};
const size_t size_per_head_{};
const size_t max_seq_len_{};
const size_t elem_bits_{};
const size_t cache_byte_size_{};
const size_t max_entry_count_{};
const size_t chunk_size_{};
const int rank_{};
IAllocator* allocator_{};
std::queue<void*> device_free_;
std::vector<void*> device_mem_;
int entry_count_{};
uint64_t timestamp_{};
std::vector<Sequence> device_cache_;
};
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/debug_utils.h"
namespace turbomind {
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
attention_mask_ = (T*)allocator_->reMalloc(attention_mask_, sizeof(T) * batch_size * max_q_len * max_kv_len, false);
padding_offset_ = (int*)allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * max_q_len, false);
cu_seqlens_ = (int*)allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextDecoder<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&padding_offset_);
allocator_->free((void**)&cu_seqlens_);
allocator_->free((void**)&attention_mask_);
allocator_->free((void**)&h_pinned_token_num_ptr_, true);
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
context_attention_layer_ = new LlamaContextAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_fmha,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaContextDecoder<T>::forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final)
{
// TM_LOG_ERROR(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors{
{"input_query", Tensor{MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_io}},
{"attention_mask",
{MEMORY_GPU, data_type_, {sess.batch_size, 1, sess.max_query_len, sess.max_key_len}, attention_mask_}},
{"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &layer}},
{"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_final}},
{"padding_offset", {MEMORY_GPU, TYPE_INT32, {sess.token_num}, padding_offset_}},
{"cu_seqlens", {MEMORY_GPU, TYPE_INT32, {sess.batch_size + 1}, cu_seqlens_}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.input_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.context_length}},
{"cu_block_counts", input_tensors->at("cu_block_counts")},
{"rope_theta", input_tensors->at("rope_theta")},
{"max_seq_len", input_tensors->at("max_seq_len")}};
TensorMap self_attention_output_tensors{
{"hidden_features", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_io}},
{"key_cache", output_tensors->at("key_cache")},
{"value_cache", output_tensors->at("value_cache")},
{"tmp_k", output_tensors->at("tmp_k")},
{"tmp_v", output_tensors->at("tmp_v")}};
context_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
initialize(attn_params, kv_head_num, use_fmha, cache_block_seq_len, quant_policy);
}
template<typename T>
LlamaContextDecoder<T>::~LlamaContextDecoder()
{
delete context_attention_layer_;
delete silu_ffn_layer_;
freeBuffer();
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
/**
* input tensors:
* \param decoder_input [num_token, hidden_units], float
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param output_norm_weight [hidden_dims], float
* \param max_q_len [1], int on cpu
* \param max_kv_len [1], int on cpu
* \param max_seq_len [1], int on cpu
*
* output tensors:
* \param decoder_output [num_token, hidden_units],
* \param key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x]
* \param value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head]
* \param last_token_hidden_units [batch_size, hidden_units]
*/
Session sess{};
sess.token_num = input_tensors->at("decoder_input").shape[0];
sess.batch_size = input_tensors->at("input_lengths").shape[0];
sess.max_query_len = input_tensors->at("max_q_len").getVal<int>();
sess.max_key_len = input_tensors->at("max_kv_len").getVal<int>();
sess.weights = decoder_layer_weights;
sess.input_length = input_tensors->at("input_lengths").getPtr<int>();
sess.context_length = input_tensors->at("context_lengths").getPtr<int>();
T* decoder_input_output = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
allocateBuffer(sess.batch_size, sess.token_num, sess.max_query_len, sess.max_key_len);
// dbg(padding_offset_);
FT_CHECK(padding_offset_);
size_t tmp_token_num{};
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
padding_offset_,
cu_seqlens_,
input_tensors->at("input_lengths").getPtr<int>(),
sess.batch_size,
sess.max_query_len,
stream_);
sync_check_cuda_error();
dbg(tmp_token_num, sess.token_num);
FT_CHECK(tmp_token_num == sess.token_num);
invokeCreateCausalMasks(attention_mask_,
sess.input_length,
sess.context_length,
sess.max_query_len,
sess.max_key_len,
sess.batch_size,
stream_);
sync_check_cuda_error();
// Compare(
// decoder_input_output, sess.token_num * hidden_units_, Concat("context_decoder_input", 0), kCmpRead, stream_);
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input_output,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(sess, decoder_output, output_tensors, input_tensors, layer, false);
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
////////////////////////////////////////////
/// feed-forward network
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
TensorMap ffn_outputs{
{"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input_output, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaContextDecoder: public BaseLayer {
protected:
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
void freeBuffer() override;
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
T* attention_mask_{};
int* padding_offset_{};
int* cu_seqlens_{}; // cu for cumulative
size_t* h_pinned_token_num_ptr_{};
LlamaContextAttentionLayer<T>* context_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
int* input_length{};
int* context_length{};
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final);
public:
LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
~LlamaContextDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/llama_utils.h"
namespace turbomind {
template<typename T>
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
num_layer_(num_layer),
hidden_units_(head_num * size_per_head),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(attn_params, kv_head_num, cache_block_seq_len, quant_policy);
}
template<typename T>
LlamaDecoder<T>::~LlamaDecoder()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete self_attention_layer_;
delete silu_ffn_layer_;
}
template<typename T>
void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
int cache_block_seq_len,
int quant_policy)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer(size_t batch_size)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoder<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
NvtxScope scope("self_attn");
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
{MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io});
const int layer_id = layer;
self_attention_input_tensors.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id});
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"attention_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
self_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
void LlamaDecoder<T>::forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer)
{
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &sess.weights->at(layer)->ffn_weights);
}
template<typename T>
void LlamaDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param decoder_input [batch_size, hidden_dims]
* \param sequence_lengths [batch_size] int
* \param output_norm_weight [hidden_dims]
* \param step [1] on cpu
* \param ite [1] on cpu
* \param finished [batch_size] bool
* \param total_padding_tokens [batch_size], int
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len] bool (optional), NOT USED YET
*
* output_tensors:
* \param decoder_output [batch_size, hidden_dimension]
* \param key_cache [batch_size] uint64_t
* \param value_cache [batch_size] uint64_t
*/
// for the shape of key cache, refer to decoder_masked_multihead_attention_template.hpp
NvtxScope forward_scope("decoder_forward");
Session sess{};
sess.batch_size = input_tensors->at("decoder_input").shape[0];
sess.weights = decoder_layer_weights;
allocateBuffer(sess.batch_size);
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
T* decoder_input = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
int step = input_tensors->at("step").getVal<int>();
// Compare(decoder_input, sess.batch_size * hidden_units_, Concat("decoder_input", 0, step), kCmpRead, stream_);
////////////////////////////////////////////
/// RMSNorm
{
NvtxScope rms_norm_scope("rms_norm_0");
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
for (size_t layer = 0; layer < num_layer_; ++layer) {
NvtxScope layer_scope("decode_layer");
// output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
forwardSelfAttn(sess, decoder_output, input_tensors, layer);
{
NvtxScope rms_norm_scope("rms_norm_1");
invokeFusedAddBiasResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
// decoder_layer_output_ = ffn(decoder_normed_input_)
forwardFfn(sess, decoder_output, layer);
{
NvtxScope rms_norm_scope("rms_norm_2");
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaDecoder<half>;
template class LlamaDecoder<float>;
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaDecoder: public BaseLayer {
protected:
void allocateBuffer() override; // deprecated
void allocateBuffer(size_t batch_size);
void freeBuffer() override;
void
initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int cache_block_seq_len, int quant_policy);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
LlamaDecoderSelfAttentionLayer<T>* self_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
Tensor* k_cache;
Tensor* v_cache;
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer);
void forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer);
public:
LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy);
~LlamaDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include <string>
// #include <glog/logging.h>
namespace turbomind {
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::allocateBuffer(size_t batch_size)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const size_t local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_;
qkv_buf_ = reinterpret_cast<T*>(
allocator_->reMalloc(qkv_buf_, sizeof(T) * batch_size * local_q_kv_head_num * size_per_head_, false));
context_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(context_buf_, sizeof(T) * batch_size * local_hidden_units_, false));
workspace_ = (float*)allocator_->reMalloc(
workspace_, sizeof(float) * batch_size * local_head_num_ * kMaxSplitK * (size_per_head_ + 2), false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&context_buf_));
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
/**
* input tensors:
* \param input_query [batch_size, hidden_units],
* \param sequence_lengths [batch_size]
* \param step [1] on cpu
* \param finished [batch_size]
* \param total_padding_tokens [batch_size]
* \param layer_id [1], int on cpu
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len], (optional), NOT USED YET
* \param cache_indirection [batch_size / beam_width, beam_width, memory_max_len] (optional)
*
* output tensors:
* \param attention_output [batch_size, hidden_units],
* \param key_cache [batch, local_head_num, memory_max_len, size_per_head]
* \param value_cache [batch, local_head_num, memory_max_len, size_per_head]
*/
const T* input_query_data = input_tensors->getPtr<T>("input_query");
const int* sequence_lengths_data = input_tensors->getPtr<int>("sequence_lengths");
const bool* finished_data = input_tensors->getPtr<bool>("finished");
const int sum_seq_len = input_tensors->getVal<int>("sum_seq_len");
const int max_seq_len = input_tensors->getVal<int>("max_seq_len");
T* hidden_features_data = output_tensors->getPtr<T>("attention_output");
T** key_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
T** value_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
int* cu_block_counts = input_tensors->at("cu_block_counts").getPtr<int>();
const int layer_id = input_tensors->getVal<int>("layer_id");
const int step = input_tensors->getVal<int>("step");
// const int step_1 = step - 1;
const int batch_size = input_tensors->at("input_query").shape[0];
const float* rope_theta = input_tensors->getPtr<const float>("rope_theta", nullptr);
allocateBuffer(batch_size);
// for (int i = 0; i < batch_size; ++i) {
// if (gSequenceIds(i) == 1) {
// Compare((T*)input_query_data + hidden_units_ * i,
// hidden_units_,
// Concat("query", gSequenceIds(i), seqlens[i], layer_id),
// compare_mode,
// stream_);
// }
// }
{
NvtxScope scope("qkv_gemm");
linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
}
// if (layer_id == 0) {
// Compare(qkv_buf_, batch_size * 3 * hidden_units_, Concat("qkv_buf", step, layer_id), kCmpRead, stream_);
// }
const auto layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
// const int memory_len = max_seq_len;
DecoderMultiHeadAttentionParams<T> params{};
params.out = context_buf_;
params.q = qkv_buf_;
params.k = params.q + local_head_num_ * size_per_head_;
params.v = params.k + local_kv_head_num_ * size_per_head_;
params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
params.q_bias = weights->qkv.bias;
params.k_bias = params.q_bias + local_head_num_ * size_per_head_;
params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_;
params.batch_size = batch_size;
params.cu_block_cnts = cu_block_counts;
params.k_cache_block_ptrs = (void**)key_cache_ptrs;
params.v_cache_block_ptrs = (void**)value_cache_ptrs;
params.kv_cache_block_size = kv_cache_block_len_;
params.finished = finished_data;
params.per_sample_length = sequence_lengths_data;
params.rope_theta = rope_theta;
params.layer_offset = layer_offset;
params.num_heads = local_head_num_;
params.num_kv_heads = local_kv_head_num_;
params.size_per_head = size_per_head_;
params.inv_sqrt_dh = 1.f / std::sqrt((float)params.size_per_head);
params.rotary_embedding_dim = size_per_head_;
params.rotary_embedding_base = params_.rotary_embedding_base;
params.max_position_embeddings = params_.max_position_embeddings;
// params.use_dynamic_ntk = params_.use_dynamic_ntk;
params.use_logn_attn = params_.use_logn_attn;
params.partial_O = workspace_;
params.partial_M = params.partial_O + batch_size * local_head_num_ * kMaxSplitK * size_per_head_;
params.partial_L = params.partial_M + batch_size * local_head_num_ * kMaxSplitK;
// avg_batch_size = sum_seq_len / max_seq_len
// max_split_k = kMaxSplitK / avg_batch_size
// max_split_k' = min(max_split_k, max_seq_lens / kSliceLen)
const float avg_batch_size = max_seq_len ? (float)sum_seq_len / max_seq_len : 1;
FT_CHECK(avg_batch_size >= 1.f);
const int max_split_k = std::max(1, (int)std::ceil(kMaxSplitK / avg_batch_size));
// if (layer_id == 0) {
// TM_LOG_INFO("avg_batch_size = %.1f, max_split_k = %d", avg_batch_size, max_split_k);
// }
params.max_split_k = max_split_k;
params.max_seq_len = max_seq_len;
params.arch = arch_;
params.stream = stream_;
params.quant_policy = quant_policy_;
std::copy(weights->past_kv_scale.begin(), weights->past_kv_scale.end(), std::begin(params.kv_quant_params));
{
NvtxScope scope("decoder_multihead_attention");
DispatchDecoderMultiheadAttention<T>(params);
}
// for (int i = 0; i < batch_size; ++i) {
// if (gSequenceIds(i) == 1) {
// Compare((T*)context_buf_ + hidden_units_ * i,
// hidden_units_,
// Concat("context_buf", gSequenceIds(i), seqlens[i], layer_id),
// compare_mode,
// stream_);
// }
// }
// if (layer_id == 0) {
// Compare(context_buf_, batch_size * hidden_units_, Concat("context_buf", step, layer_id), kCmpRead, stream_);
// }
{
NvtxScope scope("o_gemm");
linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);
}
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(
hidden_features_data, hidden_features_data, batch_size * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
// LOG(WARNING);
}
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace turbomind
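The avg_batch_size / max_split_k heuristic in LlamaDecoderSelfAttentionLayer<T>::forward above throttles split-K parallelism as the effective batch grows. A small numeric sketch (input values are illustrative; kMaxSplitK = 16 matches the constant declared in the layer's header below):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int kMaxSplitK  = 16;    // matches the header's constant
    const int sum_seq_len = 4096;  // illustrative: total context length across the batch
    const int max_seq_len = 1024;  // illustrative: longest context in the batch

    const float avg_batch_size = max_seq_len ? (float)sum_seq_len / max_seq_len : 1;        // 4.0
    const int   max_split_k    = std::max(1, (int)std::ceil(kMaxSplitK / avg_batch_size));  // 4

    std::printf("avg_batch_size = %.1f, max_split_k = %d\n", avg_batch_size, max_split_k);
    return 0;
}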
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaDecoderSelfAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size);
LlamaDecoderSelfAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
const LlamaAttentionParams& attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
kv_head_num_(kv_head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
local_kv_head_num_(kv_head_num_ / tensor_para.world_size_),
local_hidden_units_(hidden_units_ / tensor_para.world_size_),
params_(attn_params),
tensor_para_(tensor_para),
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
kv_cache_block_len_(cache_block_seq_len),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
quant_policy_(quant_policy)
{
arch_ = getSMVersion();
}
~LlamaDecoderSelfAttentionLayer()
{
freeBuffer();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
private:
const size_t head_num_;
const size_t kv_head_num_;
const size_t size_per_head_;
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_kv_head_num_;
const size_t local_hidden_units_;
const size_t kv_cache_block_len_;
const bool is_free_buffer_after_forward_;
const int quant_policy_;
const LlamaAttentionParams& params_;
NcclParam tensor_para_;
cudaStream_t stream_;
IAllocator* allocator_;
LlamaLinear<T> linear_;
T* qkv_buf_ = nullptr;
T* context_buf_ = nullptr;
static constexpr int kMaxSplitK = 16; // must be <= WARP_SIZE
float* workspace_ = nullptr;
bool is_allocate_buffer_{};
int arch_{};
};
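// Editor's illustrative sketch (not part of the original diff): the
// tensor-parallel partitioning performed in the constructor's initializer
// list above, written out as a tiny standalone helper. The names below are
// hypothetical and exist only for illustration. For example, head_num = 32,
// kv_head_num = 32, size_per_head = 128 and world_size = 2 give 16 local
// query heads, 16 local KV heads and 2048 local hidden units per rank.
struct TpSplitExample {
    size_t local_head_num;
    size_t local_kv_head_num;
    size_t local_hidden_units;
};
inline TpSplitExample tp_split_example(size_t head_num, size_t kv_head_num, size_t size_per_head, size_t world_size)
{
    const size_t hidden_units = head_num * size_per_head;  // 32 * 128 = 4096 in the example
    return TpSplitExample{head_num / world_size,       // query heads owned by this rank
                          kv_head_num / world_size,    // key/value heads owned by this rank
                          hidden_units / world_size};  // per-rank slice of the hidden dimension
}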
} // namespace turbomind
......@@ -31,6 +31,7 @@
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/models/llama/unified_decoder.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
......@@ -47,19 +48,14 @@ LlamaV2<T>::LlamaV2(size_t head_num,
size_t inter_size,
size_t num_layer,
size_t vocab_size,
const LlamaAttentionParams& attn_params,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
const LlamaAttentionParams& attn_params,
int start_id,
int end_id,
float cache_max_block_count,
int cache_block_seq_len,
int cache_chunk_size,
int quant_policy,
bool use_context_fmha,
const EngineParams& engine_params,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
......@@ -89,7 +85,6 @@ LlamaV2<T>::LlamaV2(size_t head_num,
is_free_buffer_after_forward_(is_free_buffer_after_forward),
cuda_device_prop_(cuda_device_prop),
debug_(isDebug()),
step_length_(step_length),
shared_state_(shared_state)
{
......@@ -99,38 +94,7 @@ LlamaV2<T>::LlamaV2(size_t head_num,
vocab_size_padded_ =
(vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;
size_t elem_bits = 0;
if (quant_policy & QuantPolicy::kCacheKVInt8) {
elem_bits = sizeof(int8_t) * 8;
}
else {
elem_bits = sizeof(T) * 8;
}
const size_t local_kv_head_num = kv_head_num / tensor_para.world_size_;
auto sequence_manager = std::make_unique<SequenceManager>(num_layer,
local_kv_head_num,
size_per_head_,
cache_block_seq_len,
cache_max_block_count,
cache_chunk_size,
elem_bits,
tensor_para_.rank_,
allocator);
const size_t max_session_len = sequence_manager->max_block_count() * cache_block_seq_len;
if (max_session_len < session_len) {
if (tensor_para.rank_ == 0) {
TM_LOG_WARNING("No enough blocks for `session_len` (%d), `session_len` truncated to %d.",
session_len,
max_session_len);
}
session_len = max_session_len;
}
batch_ = std::make_unique<LlamaBatch<T>>(
max_batch_size, max_context_token_num, session_len, std::move(sequence_manager), this);
batch_ = std::make_unique<LlamaBatch<T>>(engine_params, cache_block_seq_len, quant_policy, this);
initialize(attn_params, kv_head_num, use_context_fmha, cache_block_seq_len, quant_policy);
......@@ -141,9 +105,8 @@ LlamaV2<T>::LlamaV2(size_t head_num,
template<typename T>
LlamaV2<T>::~LlamaV2()
{
delete decoder_;
unified_decoder_.reset();
delete dynamic_decode_layer_;
delete context_decoder_;
}
template<typename T>
......@@ -155,36 +118,21 @@ void LlamaV2<T>::initialize(const LlamaAttentionParams& attn_params,
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
context_decoder_ = new LlamaContextDecoder<T>(head_num_,
kv_head_num,
size_per_head_,
inter_size_,
num_layer_,
attn_params,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_context_fmha,
cache_block_seq_len,
quant_policy);
decoder_ = new LlamaDecoder<T>(head_num_,
kv_head_num,
size_per_head_,
inter_size_,
num_layer_,
attn_params,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cache_block_seq_len,
quant_policy);
unified_decoder_.reset(new UnifiedDecoder<T>(head_num_,
kv_head_num,
size_per_head_,
inter_size_,
num_layer_,
attn_params,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_context_fmha,
cache_block_seq_len,
quant_policy));
dynamic_decode_layer_ = new DynamicDecodeLayer<float>(vocab_size_,
vocab_size_padded_,
......@@ -218,31 +166,32 @@ void LlamaV2<T>::embeddingLookup(T* embeddings, const int* token_ids_buf, int ba
}
template<typename T>
void LlamaV2<T>::contextDecode(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
void** tmp_k_ptrs,
void** tmp_v_ptrs,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* context_length,
const int* cu_block_counts,
const float* rope_theta,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size)
void LlamaV2<T>::forwardUnified(T* out,
T* decoder_output,
T* decoder_input,
void** k_block_ptrs,
void** v_block_ptrs,
const int* input_ids,
const int* cu_block_cnts,
const float* rope_theta,
const bool* dc_finished,
const int* pf_input_length,
const int* pf_context_length,
T** pf_tmp_k_ptrs,
T** pf_tmp_v_ptrs,
size_t token_num,
int dc_batch_size,
int dc_step,
int dc_sum_seq_len,
int dc_max_seq_len,
int pf_batch_size,
int pf_max_input_len,
int pf_max_context_len,
int pf_session_len)
{
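// Editor's annotation (inferred from the dc_/pf_ prefixes and how the buffers
// are used below): dc_* parameters describe decoding requests, which extend
// their block-based KV cache one token per step, while pf_* parameters
// describe prefill (context) requests that process chunks of prompt tokens.
// Both kinds are handled by a single pass through the unified decoder; the
// batch dimension used below is dc_batch_size + pf_batch_size.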
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (tensor_para_.rank_ == 0) {
TM_LOG_INFO("context decoding start");
}
invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf,
invokeInputIdsEmbeddingLookupPosEncoding(decoder_input,
nullptr, // processed somewhere else
weights_->pre_decoder_embedding_table,
static_cast<T*>(nullptr),
......@@ -256,81 +205,32 @@ void LlamaV2<T>::contextDecode(T* decoder_output,
stream_);
sync_check_cuda_error();
const auto dtype = getTensorType<T>();
const auto bsz = batch_size;
const int max_q_len = max_input_len;
const int max_kv_len = max_context_len;
const int max_seq_len = session_len;
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, context_decoder_input_buf}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, input_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, context_length}},
{"max_q_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_q_len}},
{"max_kv_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_kv_len}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
{"rope_theta", {MEMORY_GPU, TYPE_FP32, {hidden_units_}, rope_theta}},
{"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {batch_size}, cu_block_counts}}};
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {token_num, hidden_units_}, context_decoder_output_buf}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, v_cache_ptr}},
{"tmp_k", {MEMORY_GPU, TYPE_UINT64, {bsz}, tmp_k_ptrs}},
{"tmp_v", {MEMORY_GPU, TYPE_UINT64, {bsz}, tmp_v_ptrs}},
{"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, decoder_output}}};
context_decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
if (tensor_para_.rank_ == 0) {
TM_LOG_INFO("context decoding end");
}
}
template<typename T>
void LlamaV2<T>::decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const bool* finished,
const int* cu_block_counts,
const float* rope_theta,
int step,
int ite,
int sum_seq_len,
int max_seq_len,
size_t batch_size)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const auto dtype = getTensorType<T>();
// max_input_length is not used w/o linear_bias_slopes
// sequence_lengths_ will be incremented in dynamic decode
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_input}},
{"sequence_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}},
{"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {batch_size}, cu_block_counts}},
{"sum_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &sum_seq_len}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"rope_theta", {MEMORY_GPU, TYPE_FP32, {batch_size}, rope_theta}},
{"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"ite", {MEMORY_CPU, TYPE_INT32, {1}, &ite}},
};
// LOG(ERROR) << key_cache_ << " " << value_cache_;
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_output}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, v_cache_ptr}},
};
decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
const auto dtype = getTensorType<T>();
const size_t bsz = dc_batch_size + pf_batch_size;
TensorMap inputs{{"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_input}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, pf_input_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, pf_context_length}},
{"dc_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &dc_batch_size}},
{"dc_sum_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &dc_sum_seq_len}},
{"dc_max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &dc_max_seq_len}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {bsz}, dc_finished}},
{"pf_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &pf_batch_size}},
{"pf_max_q_len", {MEMORY_CPU, TYPE_INT32, {1}, &pf_max_input_len}},
{"pf_max_k_len", {MEMORY_CPU, TYPE_INT32, {1}, &pf_max_context_len}},
{"session_len", {MEMORY_CPU, TYPE_INT32, {1}, &pf_session_len}},
{"rope_theta", {MEMORY_GPU, TYPE_FP32, {hidden_units_}, rope_theta}},
{"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {bsz}, cu_block_cnts}}};
TensorMap outputs{{"decoder_output", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_output}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, k_block_ptrs}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, v_block_ptrs}},
{"tmp_k", {MEMORY_GPU, TYPE_UINT64, {bsz}, pf_tmp_k_ptrs}},
{"tmp_v", {MEMORY_GPU, TYPE_UINT64, {bsz}, pf_tmp_v_ptrs}},
{"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, out}}};
unified_decoder_->forward(&outputs, &inputs, &weights_->decoder_layer_weights);
}
template<typename T>
......
......@@ -24,12 +24,11 @@
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/unified_decoder.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/instance_comm.h"
......@@ -59,19 +58,14 @@ public:
size_t inter_size,
size_t num_layer,
size_t vocab_size,
const LlamaAttentionParams& attn_params,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
const LlamaAttentionParams& attn_params,
int start_id,
int end_id,
float cache_max_block_count,
int cache_block_seq_len,
int cache_chunk_size,
int quant_policy,
bool use_context_fmha,
const EngineParams& engine_params,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
......@@ -113,37 +107,28 @@ private:
void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step);
void contextDecode(T* decoder_output,
uintptr_t* k_block_ptrs,
uintptr_t* v_block_ptrs,
void** k_tmp_ptrs,
void** v_tmp_ptrs,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* context_length,
const int* cu_block_counts,
const float* rope_theta,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
void forwardUnified(T* out,
T* decoder_output,
T* decoder_input,
const int* sequence_length,
const bool* finished,
const int* cu_block_counts,
void** k_block_ptrs,
void** v_block_ptrs,
const int* input_ids,
const int* cu_block_cnts,
const float* rope_theta,
int step,
int ite,
int sum_seq_len,
int max_seq_len,
size_t batch_size);
const bool* dc_finished,
const int* pf_input_length,
const int* pf_context_length,
T** pf_tmp_k_ptrs,
T** pf_tmp_v_ptrs,
size_t token_num,
int dc_batch_size,
int dc_step,
int dc_sum_seq_len,
int dc_max_seq_len,
int pf_batch_size,
int pf_max_input_len,
int pf_max_context_len,
int pf_session_len);
void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size);
......@@ -195,12 +180,11 @@ private:
const bool debug_{false};
LlamaWeight<T>* weights_{};
LlamaDecoder<T>* decoder_{};
LlamaContextDecoder<T>* context_decoder_{};
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
LlamaWeight<T>* weights_{};
std::unique_ptr<UnifiedDecoder<T>> unified_decoder_;
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
const int step_length_;
std::shared_ptr<SharedState> shared_state_;
ffi_api_lock_ctrl_t ffi_lock_;
std::unique_ptr<LlamaBatch<T>> batch_;
......