Unverified Commit 7f943a26 authored by Li Zhang, committed by GitHub

Unify prefill & decode passes (#775)

* Unify prefill and decode passes

* dynamic split-fuse

* refactor

* correct input count calculation

* remove unused

* lint

* lint

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build
parent 2ba90822
......@@ -13,8 +13,8 @@
namespace turbomind {
struct Request {
uint64_t id;
uint64_t priority;
uint64_t id; // sequence id
uint64_t unique_id; // monotonically increasing
bool start_flag;
bool end_flag;
......
......@@ -36,7 +36,7 @@ SequenceManager::SequenceManager(size_t layer_num,
const Sequence* SequenceManager::Create(uint64_t id)
{
Sequence sequence{id, {}, {}, {}, {}, {}, {}, 0.f};
Sequence sequence{id};
auto it = sequences_.find(id);
if (it != sequences_.end()) {
......@@ -152,18 +152,23 @@ struct Schedule {
int last;
int input_count1;
int input_count2;
Sequences active;
std::vector<int> block_counts;
Sequences inactive;
Sequences victims;
Schedule(Snapshot snapshot, int size):
Schedule(Snapshot snapshot, int size, int _input_count1, int _input_count2):
free(snapshot.free),
cached(snapshot.cached),
last(size),
use_count_(std::move(snapshot.use_count)),
unlocked_(size),
it_(size)
it_(size),
input_count1(_input_count1),
input_count2(_input_count2)
{
}
......@@ -208,6 +213,7 @@ std::ostream& operator<<(std::ostream& os, const Schedule& s)
struct Transaction {
int index_;
int block_count_;
int input_count_;
int allocate_{};
int evict_{};
......@@ -218,44 +224,45 @@ struct Transaction {
const Sequences& sequences_;
Schedule& schedule_;
explicit Transaction(const Sequences& sequences, int index, int block_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count)
explicit Transaction(const Sequences& sequences, int index, int block_count, int input_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count), input_count_(input_count)
{
}
void Process()
{
int count = block_count_;
if (schedule_.input_count1 > 0) {
int count = block_count_;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
}
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
if (count == 0) {
return Commit();
}
}
if (count == 0) {
Commit();
}
else {
schedule_.inactive.push_back(sequences_[index_]);
}
const_cast<Sequence*>(sequences_[index_])->input_length = 0;
schedule_.inactive.push_back(sequences_[index_]);
}
void Commit()
......@@ -276,6 +283,13 @@ struct Transaction {
// update active sequences
schedule_.active.push_back(sequences_[index_]);
schedule_.block_counts.push_back(block_count_);
if (input_count_ > schedule_.input_count2) {
input_count_ = schedule_.input_count1;
}
schedule_.input_count1 -= input_count_;
schedule_.input_count2 -= input_count_;
const_cast<Sequence*>(sequences_[index_])->input_length = input_count_;
}
};
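The token-budget bookkeeping introduced in Commit() is easier to follow in isolation. The sketch below restates the arithmetic from the hunk above; reading input_count1 as the base per-iteration token budget and input_count2 as the extended (base plus extra) budget is an assumption made for illustration, not something stated in the code.

// Restatement of the budget clamp in Commit() (illustrative only).
// `budget1`/`budget2` stand in for `input_count1`/`input_count2`.
inline int GrantInputCount(int requested, int& budget1, int& budget2)
{
    // A request exceeding the extended budget is truncated to what remains of
    // the base budget, deferring the rest of the prefill to a later iteration.
    const int granted = requested > budget2 ? budget1 : requested;
    budget1 -= granted;
    budget2 -= granted;
    return granted;
}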
......@@ -308,6 +322,25 @@ void SequenceManager::SortByPriority(Sequences& sequences,
context_lengths.swap(tmp_lengths);
}
// template<class P, class... Ts>
// void SortByPriority(const std::vector<P>& priorities, Ts&... ranges)
// {
// // sort according to priority
// std::vector<int> idxs(priorities.size());
// std::iota(idxs.begin(), idxs.end(), 0);
// std::sort(idxs.begin(), idxs.end(), [&](int i, int j) {
// return priorities[i] < priorities[j]; //
// });
// auto reorder = [&](auto& src) {
// auto dst = src;
// for (size_t i = 0; i < idxs.size(); ++i) {
// dst[i] = src[idxs[i]];
// }
// src.swap(dst);
// };
// (reorder(ranges), ...);
// }
std::vector<int> SequenceManager::CountRequiredBlocks(const Sequences& sequences,
const std::vector<int>& context_lengths,
int step_length)
......@@ -344,7 +377,8 @@ void SequenceManager::AssignAndActivate(const Sequences& sequenc
auto SequenceManager::Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length) -> Outcome
int step_length,
AdjustInputCount adjust) -> Outcome
{
////////////////////////////////////////////////////////////////////////////////
/// Schedule the assignment of blocks to sequences
......@@ -354,18 +388,23 @@ auto SequenceManager::Materialize(Sequences sequences,
SortByPriority(sequences, context_lengths, priorities);
// SortByPriority(priorities, sequences, context_lengths);
// Verify and lock cached sequences so that their blocks are not evicted unnoticed;
// the blocks can still be preempted later
VerifyAndLockCached(sequences);
auto [input_count1, input_count2] = adjust(sequences, context_lengths);
std::vector<int> required = CountRequiredBlocks(sequences, context_lengths, step_length);
// dbg(required);
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size());
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size(), input_count1, input_count2);
// `schedule.last` is decreasing in the loop
for (int i = 0; i < schedule.last; ++i) {
Transaction{sequences, i, required[i], schedule}.Process();
const int input_length = context_lengths[i] - sequences[i]->cache_len;
Transaction{sequences, i, required[i], input_length, schedule}.Process();
}
// mark remaining sequences invalid
......
......@@ -3,6 +3,7 @@
#pragma once
#include "src/turbomind/models/llama/BlockManager.h"
#include <functional>
namespace turbomind {
......@@ -16,19 +17,23 @@ struct Sequence {
};
uint64_t id;
Status status;
Status status = kCached;
std::vector<const Block*> blocks;
std::vector<uint64_t> block_unique_ids;
int input_length = 0;
mutable std::vector<int> tokens; // updated by the user
mutable int cache_len;
mutable int cache_len = 0;
// additional data kept from round to round
mutable std::vector<std::byte> random_state; // updated by the user
mutable float rope_theta;
mutable float rope_theta = 0.f;
Sequence(uint64_t _id): id(_id) {}
friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};
......@@ -74,10 +79,13 @@ public:
int swap_out;
};
using AdjustInputCount = std::function<std::pair<int, int>(const Sequences&, const std::vector<int>&)>;
[[nodiscard]] Outcome Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length);
int step_length,
AdjustInputCount adjust);
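The added AdjustInputCount hook lets the engine decide how many new input tokens the scheduler may admit in this pass. A hypothetical callback is sketched below, assuming it is written inside namespace turbomind; the constants and the reading of the returned pair as (base budget, base budget + extra) are assumptions for illustration only.

#include <utility>
#include <vector>

// Hypothetical AdjustInputCount callback (illustrative values, not the
// engine's actual policy).
SequenceManager::AdjustInputCount adjust =
    [](const Sequences& seqs, const std::vector<int>& context_lengths) {
        constexpr int num_tokens_per_iter   = 8192;  // assumed config value
        constexpr int extra_tokens_per_iter = 512;   // assumed config value
        return std::make_pair(num_tokens_per_iter,
                              num_tokens_per_iter + extra_tokens_per_iter);
    };
// Passed as the new last argument of Materialize(), assuming `manager`,
// `sequences`, `context_lengths` and `priorities` exist in scope:
//   auto outcome = manager.Materialize(sequences, context_lengths, priorities,
//                                      /*step_length=*/1, adjust);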
void* OffsetKey(void* block_ptr)
{
......
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
class BatchedCopy {
public:
template<class T, std::enable_if_t<alignof(T) <= alignof(uint32_t), int> = 0>
T* Add(const T* src, int size, T* dst)
{
src_.push_back((void*)src);
dst_.push_back((void*)dst);
size_.push_back(sizeof(T) * size);
return dst + size;
}
void Submit(cudaStream_t stream)
{
invokeBatchedCopy(src_.data(), dst_.data(), size_.data(), size_.size(), stream);
sync_check_cuda_error();
src_.clear();
dst_.clear();
size_.clear();
}
private:
std::vector<void*> src_;
std::vector<void*> dst_;
std::vector<int> size_;
};
} // namespace turbomind
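A usage sketch for BatchedCopy: record several small device-to-device copies on the host and flush them with a single kernel launch. The buffer pointers, element counts and the header path are assumptions for illustration.

#include <cuda_runtime.h>

#include "src/turbomind/models/llama/copy.h"  // assumed location of BatchedCopy

void CopySegments(const int* const* d_srcs, int* const* d_dsts, const int* lengths,
                  int n, cudaStream_t stream)
{
    turbomind::BatchedCopy batched_copy;
    for (int i = 0; i < n; ++i) {
        // Add() records one copy of `lengths[i]` elements and returns
        // `dst + size`, so destination regions can be packed back to back.
        batched_copy.Add(d_srcs[i], lengths[i], d_dsts[i]);
    }
    batched_copy.Submit(stream);  // one invokeBatchedCopy launch for all records
}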
......@@ -101,6 +101,8 @@ __device__ T blockReduceSum(const cg::thread_block& block, T value)
return cg::reduce(tile, value, cg::plus<float>{});
}
// r' = r + x
// x' = norm(r') * scales
template<typename T>
__global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
T* __restrict__ x_data,
......
......@@ -9,11 +9,13 @@
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/dispatch.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <cstdint>
#include <cub/block/block_reduce.cuh>
#include <type_traits>
#include <utility>
namespace turbomind {
......@@ -543,8 +545,10 @@ __global__ void gatherOutput(int* output_ids,
continue;
}
// skip padding for dst
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
if (dst_idx < max_output_len) {
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
}
}
}
......@@ -694,50 +698,31 @@ void invokeIndexedCopyImpl(void** h_src_ptr,
int count,
cudaStream_t st)
{
auto invoke = [&](auto max_count) {
constexpr int C = decltype(max_count)::value;
// maximum parameter size: sm<70: 4kB, sm>=70: 32kB
static_assert(sizeof(IndexedCopyParam<N, C>) <= 4096);
IndexedCopyParam<N, C> param{};
std::copy_n(h_src_ptr, N, param.src_ptr.data());
std::copy_n(h_dst_ptr, N, param.dst_ptr.data());
std::transform(h_elem_sz, h_elem_sz + N, param.stride.data(), [](int size) {
// Basic alignment check
FT_CHECK_WITH_INFO(size % sizeof(T) == 0, fmtstr("misalignment: %d %% %d", size, (int)sizeof(T)));
return size / sizeof(T);
dispatch( // dispatch for num of copy operations
std::integer_sequence<int, 4, 8, 16, 32, 64, 128, 256>{},
[&](auto C) { return count <= C; },
[&](auto C) {
// maximum parameter size: sm<70: 4kB, sm>=70: 32kB
static_assert(sizeof(IndexedCopyParam<N, C>) <= 4096);
IndexedCopyParam<N, C> param{};
std::copy_n(h_src_ptr, N, param.src_ptr.data());
std::copy_n(h_dst_ptr, N, param.dst_ptr.data());
std::transform(h_elem_sz, h_elem_sz + N, param.stride.data(), [](int size) {
// Basic alignment check
FT_CHECK_WITH_INFO(size % sizeof(T) == 0, fmtstr("misalignment: %d %% %d", size, (int)sizeof(T)));
return size / sizeof(T);
});
param.max_stride = *std::max_element(param.stride.begin(), param.stride.end());
auto copy_idx = [](const int* src, int offset, int n, auto dst) {
return src ? (void)std::copy_n(src + offset, n, dst) : std::iota(dst, dst + n, offset);
};
for (int c = 0; c < count; c += C) {
int batch_size = std::min(count - c, (int)C);
copy_idx(h_src_idx, c, batch_size, param.src_idx.data());
copy_idx(h_dst_idx, c, batch_size, param.dst_idx.data());
indexedCopy<T><<<batch_size, 128, 0, st>>>(param);
}
});
param.max_stride = *std::max_element(param.stride.begin(), param.stride.end());
auto copy_idx = [](const int* src, int offset, int n, auto dst) {
return src ? (void)std::copy_n(src + offset, n, dst) : std::iota(dst, dst + n, offset);
};
for (int c = 0; c < count; c += C) {
int batch_size = std::min(count - c, C);
copy_idx(h_src_idx, c, batch_size, param.src_idx.data());
copy_idx(h_dst_idx, c, batch_size, param.dst_idx.data());
indexedCopy<T><<<batch_size, 128, 0, st>>>(param);
}
};
if (count <= 4) {
invoke(std::integral_constant<int, 4>{});
}
if (count <= 8) {
invoke(std::integral_constant<int, 8>{});
}
else if (count <= 16) {
invoke(std::integral_constant<int, 16>{});
}
else if (count <= 32) {
invoke(std::integral_constant<int, 32>{});
}
else if (count <= 64) {
invoke(std::integral_constant<int, 64>{});
}
else if (count <= 128) {
invoke(std::integral_constant<int, 128>{});
}
else {
invoke(std::integral_constant<int, 256>{});
}
}
void invokeIndexedCopy(void** h_src_ptr,
......@@ -749,19 +734,14 @@ void invokeIndexedCopy(void** h_src_ptr,
int n_copys,
cudaStream_t st)
{
auto args = std::tuple{h_src_ptr, h_dst_ptr, h_elem_sz, h_src_idx, h_dst_idx, count, st};
switch (n_copys) {
case 1:
return std::apply(invokeIndexedCopyImpl<uint32_t, 1>, args);
case 2:
return std::apply(invokeIndexedCopyImpl<uint32_t, 2>, args);
case 3:
return std::apply(invokeIndexedCopyImpl<uint32_t, 3>, args);
case 4:
return std::apply(invokeIndexedCopyImpl<uint32_t, 4>, args);
default:
FT_CHECK(0);
}
auto success = dispatch(std::integer_sequence<int, 1, 2, 3, 4>{}, [&](auto N) {
if (N == n_copys) {
invokeIndexedCopyImpl<uint32_t, N>(h_src_ptr, h_dst_ptr, h_elem_sz, h_src_idx, h_dst_idx, count, st);
return true;
}
return false;
});
FT_CHECK(success);
}
__global__ void padLastTokenIds(int* token_ids, const int* context_length, int max_context_len, int batch_size)
......@@ -777,6 +757,96 @@ void invokePadLastTokenIds(
padLastTokenIds<<<1, 512, 0, stream>>>(token_ids, context_length, max_context_len, batch_size);
}
template<typename T>
__global__ void getFeatureOfLastToken(T* output, const T* input, const int* cu_seqlens, int dims)
{
int bi = blockIdx.x;
int ti = cu_seqlens[bi + 1] - 1;
for (int i = threadIdx.x; i < dims; i += blockDim.x) {
output[dims * bi + i] = input[dims * ti + i];
}
}
template<typename T>
void invokeGetFeatureOfLastToken(
T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream)
{
getFeatureOfLastToken<<<batch_size, 256, 0, stream>>>(output, input, cu_seqlens, dims);
}
template void invokeGetFeatureOfLastToken(half*, const half*, const int*, int, int, cudaStream_t);
template void invokeGetFeatureOfLastToken(float*, const float*, const int*, int, int, cudaStream_t);
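A minimal illustrative driver for the kernel above, assuming two sequences of length 3 and 4 and a hidden size of 8: with cu_seqlens = {0, 3, 7}, rows 2 and 6 of the input (the last token of each sequence) are gathered into a [2, dims] output. All sizes are made up for the example.

#include <cuda_runtime.h>
#include <vector>

#include "src/turbomind/models/llama/llama_kernels.h"

void ExampleGatherLastToken(cudaStream_t stream)
{
    const int dims = 8, num_token = 7, batch_size = 2;
    const std::vector<int> h_cu_seqlens = {0, 3, 7};

    float* d_in{};
    float* d_out{};
    int*   d_cu{};
    cudaMalloc(&d_in, sizeof(float) * num_token * dims);
    cudaMalloc(&d_out, sizeof(float) * batch_size * dims);
    cudaMalloc(&d_cu, sizeof(int) * (batch_size + 1));
    cudaMemcpyAsync(d_cu, h_cu_seqlens.data(), sizeof(int) * (batch_size + 1),
                    cudaMemcpyHostToDevice, stream);
    // ... fill d_in with [num_token, dims] hidden states ...

    turbomind::invokeGetFeatureOfLastToken(d_out, d_in, d_cu, dims, batch_size, stream);
    cudaStreamSynchronize(stream);

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_cu);
}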
template<class T, int C>
struct BatchedCopyParam {
Array<T*, C> src_ptr;
Array<T*, C> dst_ptr;
Array<int, C> size;
int count;
};
template<int kThrPerCpy, class T, int C>
__global__ void batchedCopy(BatchedCopyParam<T, C> param)
{
const int ti = threadIdx.x + blockIdx.x * blockDim.x;
const int bi = ti / kThrPerCpy;
if (bi >= param.count) {
return;
}
const T* __restrict__ src = param.src_ptr[bi];
T* __restrict__ dst = param.dst_ptr[bi];
int size = param.size[bi];
for (int i = ti % kThrPerCpy; i < size; i += kThrPerCpy) {
dst[i] = src[i];
}
}
// MSVC does not like CUDA kernel launches inside nested lambdas
template<class P>
struct BatchedCopyLauncher {
int max_size;
int count;
const P* params;
cudaStream_t st;
template<int S>
void operator()(std::integral_constant<int, S>) const
{
constexpr int threads = 128;
constexpr int items_per_block = threads / S;
const int blocks = (count + items_per_block - 1) / items_per_block;
batchedCopy<S><<<blocks, threads, 0, st>>>(*params);
}
};
void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cudaStream_t st)
{
dispatch(
std::integer_sequence<int, 1, 8, 32, 128>{},
[&](auto C) { return count <= C; },
[&](auto C) {
using T = uint32_t;
BatchedCopyParam<T, C> params{};
// TODO: on CUDA 12.1 and sm_70+ this can be 32K
static_assert(sizeof(params) <= 4096);
for (int c = 0; c < count; c += C) {
const int bsz = std::min<int>(count - c, C);
params.count = bsz;
for (int i = 0; i < bsz; ++i) {
params.src_ptr[i] = (T*)src_ptr[c + i];
params.dst_ptr[i] = (T*)dst_ptr[c + i];
FT_CHECK(size[c + i] % sizeof(T) == 0);
params.size[i] = size[c + i] / sizeof(T);
}
const int max_size = *std::max_element(params.size.begin(), params.size.end());
dispatch(
std::integer_sequence<int, 1, 2, 4, 8, 16, 32, 64, 128>{},
[&](auto S) { return max_size <= S; },
BatchedCopyLauncher<BatchedCopyParam<T, C>>{max_size, count, &params, st});
}
});
}
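A worked example of the two-level dispatch above, with illustrative numbers: 20 copy jobs whose largest payload is 96 uint32_t words select capacity C = 32 (smallest constant with count <= C) and S = 128 threads per copy (smallest constant with max_size <= S), i.e. one copy per 128-thread block and 20 blocks in total.

// Compile-time check of the constants the two dispatches would pick for
// count = 20 and max_size = 96 (illustrative values only).
static_assert(!(20 <= 8) && 20 <= 32, "outer dispatch picks C = 32 from {1, 8, 32, 128}");
static_assert(!(96 <= 64) && 96 <= 128, "inner dispatch picks S = 128 from {1, 2, ..., 128}");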
#define VERSION_SWITCH(VERSION, CONST_NAME, ...) \
[&] { \
if (VERSION == 2) { \
......
......@@ -105,6 +105,8 @@ void invokeIndexedCopy(void** h_src_ptr,
int n_copys,
cudaStream_t st);
void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cudaStream_t st);
// ABCDe ABCDe e
// ABCDEFGHIJk ABCDEFGHIJk
// ABCDEFGHi -> ABCDEFGHi i
......@@ -113,6 +115,10 @@ void invokeIndexedCopy(void** h_src_ptr,
void invokePadLastTokenIds(
int* token_ids, const int* context_length, int max_context_len, int batch_size, cudaStream_t stream);
template<typename T>
void invokeGetFeatureOfLastToken(
T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream);
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
template<typename T>
......
......@@ -13,4 +13,21 @@ struct LlamaAttentionParams {
bool use_logn_attn;
};
struct EngineParams {
// batch params
int max_batch_size;
int session_len;
int step_length;
// cache params
float cache_max_block_count;
int cache_chunk_size;
// chunking params
int max_context_token_num;
int num_tokens_per_iter;
int extra_tokens_per_iter;
int max_prefill_iters;
};
} // namespace turbomind
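The chunking fields added here drive the dynamic split-fuse behaviour configured in LlamaTritonModel below. The helper sketched next is one possible reading of those fields (a hedged interpretation, not the engine's actual code); the fallback to max_context_token_num mirrors the default applied in handleMissingParams.

// Hedged sketch: cap on new prefill tokens admitted in one forward pass.
inline int PrefillTokenBudget(const turbomind::EngineParams& p)
{
    // falls back to the overall context-token limit when no per-iter cap is set
    const int base = p.num_tokens_per_iter ? p.num_tokens_per_iter : p.max_context_token_num;
    // extra_tokens_per_iter lets a long prompt overshoot the base cap slightly
    return base + p.extra_tokens_per_iter;
}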
......@@ -19,8 +19,9 @@
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.cc
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/models/llama/unified_attention_layer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.h"
#include "src/turbomind/kernels/decoder_multihead_attention/kv_cache.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/macro.h"
......@@ -35,10 +36,14 @@
namespace turbomind {
template<typename T>
void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
size_t num_token,
size_t max_q_len,
size_t max_k_len)
// void UnifiedAttentionLayer<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t
// max_k_len)
void UnifiedAttentionLayer<T>::allocateBuffer(size_t num_token,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
size_t dc_max_split_k)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
......@@ -47,65 +52,73 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
// no padding
qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * num_token * local_q_kv_head_num * size_per_head_, false);
// padding is rebuilt for q/k/v_buf_2_
// [qH + 2kvH, B, S, D]
q_buf_2_ = (T*)allocator_->reMalloc(
q_buf_2_, sizeof(T) * local_q_kv_head_num * batch_size * max_q_len * size_per_head_, false);
k_buf_2_ = q_buf_2_ + local_head_num_ * batch_size * max_q_len * size_per_head_;
v_buf_2_ = k_buf_2_ + local_kv_head_num_ * batch_size * max_q_len * size_per_head_;
// qkv_buf_3_ padding is removed
qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * num_token * local_head_num_ * size_per_head_, false);
if (use_fmha_) {
FlashAttentionOp<T> flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
if (flash_attention.get_workspace_size() > 0) {
qk_buf_float_ = (float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), false);
}
if (pf_batch_size) {
[&](size_t bsz, size_t max_q, size_t max_k) {
// padding is rebuilt for q/k/v_buf_2_
// [qH + 2kvH, B, S, D]
q_buf_2_ = (T*)allocator_->reMalloc(
q_buf_2_, sizeof(T) * local_q_kv_head_num * bsz * max_q * size_per_head_, false);
k_buf_2_ = q_buf_2_ + local_head_num_ * bsz * max_q * size_per_head_;
v_buf_2_ = k_buf_2_ + local_kv_head_num_ * bsz * max_q * size_per_head_;
if (use_fmha_) {
FlashAttentionOp<T> flash_attention(bsz, local_head_num_, max_k, max_q, size_per_head_);
if (flash_attention.get_workspace_size() > 0) {
qk_buf_float_ =
(float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), false);
}
}
else {
// kv heads are repeated for unfused attention
k_cache_buf_ = (T*)allocator_->reMalloc(
k_cache_buf_, 2 * sizeof(T) * bsz * local_head_num_ * max_k * size_per_head_, false);
v_cache_buf_ = k_cache_buf_ + bsz * local_head_num_ * max_k * size_per_head_;
qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * bsz * local_head_num_ * max_q * max_k, false);
// qkv_buf_2_ has padding
qkv_buf_2_ = (T*)allocator_->reMalloc(
qkv_buf_2_, sizeof(T) * bsz * max_q * local_head_num_ * size_per_head_, false);
}
}(pf_batch_size, pf_max_q_len, pf_max_k_len);
}
else {
// kv heads are repeated for unfused attention
k_cache_buf_ = (T*)allocator_->reMalloc(
k_cache_buf_, 2 * sizeof(T) * batch_size * local_head_num_ * max_k_len * size_per_head_, false);
v_cache_buf_ = k_cache_buf_ + batch_size * local_head_num_ * max_k_len * size_per_head_;
qk_buf_ =
(T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * max_q_len * max_k_len, false);
// qkv_buf_2_ has padding
qkv_buf_2_ = (T*)allocator_->reMalloc(
qkv_buf_2_, sizeof(T) * batch_size * max_q_len * local_head_num_ * size_per_head_, false);
if (dc_batch_size) {
dc_workspace_ = (float*)allocator_->reMalloc(dc_workspace_,
sizeof(float) * dc_batch_size * local_head_num_ * dc_max_split_k
* (size_per_head_ + 2),
false);
}
// qkv_buf_3_ padding is removed
qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * num_token * local_head_num_ * size_per_head_, false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextAttentionLayer<T>::freeBuffer()
void UnifiedAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&q_buf_2_));
if (use_fmha_) {
allocator_->free((void**)&qk_buf_float_);
}
else {
allocator_->free((void**)(&k_cache_buf_));
allocator_->free((void**)(&qk_buf_));
allocator_->free((void**)(&qkv_buf_2_));
}
allocator_->free((void**)(&qkv_buf_3_));
allocator_->free((void**)&qk_buf_float_);
allocator_->free((void**)(&k_cache_buf_));
allocator_->free((void**)(&qk_buf_));
allocator_->free((void**)(&qkv_buf_2_));
allocator_->free((void**)&dc_workspace_);
is_allocate_buffer_ = false;
}
}
template<typename T>
inline void LlamaContextAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
inline void UnifiedAttentionLayer<T>::forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
......@@ -131,60 +144,153 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
/////////////////////////////////////////////
/// parse inputs
const int batch_size = input_tensors->at("attention_mask").shape[0];
const int max_q_len = input_tensors->at("attention_mask").shape[2];
const int max_k_len = input_tensors->at("attention_mask").shape[3];
const int layer_id = input_tensors->getVal<int>("layer_id");
const int num_token = input_tensors->at("input_query").shape[0];
const int max_seq_len = input_tensors->at("max_seq_len").getVal<int>();
const int num_token = inputs->at("input_query").shape[0];
const int layer_id = inputs->getVal<int>("layer_id");
const int session_len = inputs->getVal<int>("session_len");
int pf_batch_size = 0;
int pf_max_q_len = 0;
int pf_max_k_len = 0;
T* attention_mask{};
if (inputs->isExist("attention_mask")) {
pf_batch_size = inputs->at("attention_mask").shape[0];
pf_max_q_len = inputs->at("attention_mask").shape[2];
pf_max_k_len = inputs->at("attention_mask").shape[3];
attention_mask = inputs->getPtr<T>("attention_mask");
}
T* attention_out = output_tensors->at("hidden_features").getPtr<T>();
T* attention_input = input_tensors->at("input_query").getPtr<T>();
T* attention_mask = input_tensors->at("attention_mask").getPtr<T>();
const int dc_batch_size = inputs->getVal<int>("dc_batch_size");
const int dc_sum_seq_len = inputs->getVal<int>("dc_sum_seq_len");
const int dc_max_seq_len = inputs->getVal<int>("dc_max_seq_len");
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
int* cu_block_counts = input_tensors->at("cu_block_counts").getPtr<int>();
T* attention_input = inputs->getPtr<T>("input_query");
int* input_length = inputs->getPtr<int>("input_lengths");
int* context_length = inputs->getPtr<int>("context_lengths");
bool* is_finished = inputs->getPtr<bool>("finished");
int* cu_block_count = inputs->getPtr<int>("cu_block_counts");
int* cu_seqlens = inputs->getPtr<int>("cu_seqlens", nullptr);
int* padding_offset = inputs->getPtr<int>("padding_offset", nullptr);
float* rope_theta = inputs->getPtr<float>("rope_theta", nullptr);
const float* rope_theta = input_tensors->getPtr<const float>("rope_theta", nullptr);
auto k_cache_ptrs = outputs->getPtr<void*>("key_cache");
auto v_cache_ptrs = outputs->getPtr<void*>("value_cache");
auto tmp_k_ptrs = outputs->getPtr<T*>("tmp_k");
auto tmp_v_ptrs = outputs->getPtr<T*>("tmp_v");
const auto padding_offset = input_tensors->at("padding_offset").getPtr<int>();
auto Show = [&](const T* x, size_t n) {
std::vector<T> vec(n);
cudaMemcpyAsync(vec.data(), x, sizeof(T) * n, cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
std::vector<float> float_vec(vec.begin(), vec.end());
dbg(float_vec);
};
T* attention_out = outputs->getPtr<T>("hidden_features");
/////////////////////////////////////////////
/// allocate buffers
allocateBuffer(batch_size, num_token, max_q_len, max_k_len);
allocateBuffer(num_token, //
pf_batch_size,
pf_max_q_len,
pf_max_k_len,
dc_batch_size,
kDecodeMaxSplits);
// [2, L, H, s, D]
const size_t layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
//////////////////////////////////////////////
/// qkv gemm
// [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim]
linear_.forward(qkv_buf_, attention_input, num_token, weights->qkv);
if (pf_batch_size) {
const int offset = dc_batch_size;
const int pf_num_token = num_token - offset;
prefill(qkv_buf_3_ + offset * weights->output.input_dims,
qkv_buf_ + offset * weights->qkv.output_dims,
k_cache_ptrs,
v_cache_ptrs,
attention_mask,
cu_seqlens,
padding_offset,
tmp_k_ptrs + offset,
tmp_v_ptrs + offset,
input_length + offset,
context_length + offset,
cu_block_count + offset,
rope_theta + offset,
pf_batch_size,
pf_num_token,
layer_offset,
pf_max_q_len,
pf_max_k_len,
session_len,
weights);
}
if (dc_batch_size) {
decode(qkv_buf_3_,
qkv_buf_,
k_cache_ptrs,
v_cache_ptrs,
cu_block_count,
context_length,
is_finished,
rope_theta,
layer_offset,
dc_batch_size,
dc_sum_seq_len,
dc_max_seq_len,
kDecodeMaxSplits,
weights);
}
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
void UnifiedAttentionLayer<T>::prefill(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const T* attention_mask,
const int* cu_seqlens,
const int* padding_offset,
T** tmp_k_ptrs,
T** tmp_v_ptrs,
const int* input_length,
const int* context_length,
const int* cu_block_count,
const float* rope_theta,
int pf_batch_size,
int pf_num_token,
size_t layer_offset,
int pf_max_q_len,
int pf_max_k_len,
int pf_session_len,
const WeightType* weights)
{
//////////////////////////////////////////////
/// transpose qkv & apply rotary embedding & rebuild padding
/// qkv [B, s, H + 2kvH, D] -> (q [B, H, s, D], k [B, kvH, s, D], v [B, kvH, s, D])
invokeAddFusedQKVBiasTranspose(q_buf_2_,
k_buf_2_,
v_buf_2_,
qkv_buf_,
(T*)qkv,
weights->qkv.bias,
padding_offset, // padding_offset,
context_length, // used for applying rotary embedding
input_length,
rope_theta,
batch_size,
max_q_len, // seq_len
num_token, // batch_size * seq_len
pf_batch_size,
pf_max_q_len, // seq_len
pf_num_token,
local_head_num_,
local_kv_head_num_,
size_per_head_,
......@@ -196,15 +302,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
stream_);
sync_check_cuda_error();
// [2, L, H, s, D]
const size_t layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
auto k_cache_ptrs = output_tensors->getPtr<void*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<void*>("value_cache");
auto tmp_k_ptrs = output_tensors->getPtr<T*>("tmp_k");
auto tmp_v_ptrs = output_tensors->getPtr<T*>("tmp_v");
//////////////////////////////////////////////////////////
/// insert the k/v computed from inputs into k/v cache
/// transpose kv -> kv cache
......@@ -215,13 +312,13 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
v_cache_ptrs,
k_buf_2_,
v_buf_2_,
cu_block_counts,
cu_block_count,
input_length,
context_length,
batch_size,
pf_batch_size,
kv_cache_block_len_,
layer_offset,
max_q_len,
pf_max_q_len,
size_per_head_,
local_kv_head_num_,
quant_policy_,
......@@ -231,94 +328,147 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
const int kv_cache_elem_bits = quant_policy_ & QuantPolicy::kCacheKVInt8 ? 8 : sizeof(T) * 8;
FT_CHECK(weights->past_kv_scale.size() == 4);
ConvertKvCacheBlocksToLinear2((const void**)k_cache_ptrs,
(const void**)v_cache_ptrs,
(T**)tmp_k_ptrs,
(T**)tmp_v_ptrs,
cu_block_counts,
cu_block_count,
context_length,
layer_offset,
kv_cache_block_len_,
max_seq_len,
pf_session_len,
local_kv_head_num_,
size_per_head_,
batch_size,
pf_batch_size,
quant_policy_,
weights->past_kv_scale.data(),
stream_);
sync_check_cuda_error();
// dbg(kv_cache_block_len_, max_seq_len, local_kv_head_num_, size_per_head_, batch_size);
// void *kk, *vv;
// cudaMemcpyAsync(&kk, tmp_k_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaMemcpyAsync(&vv, tmp_v_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaStreamSynchronize(stream_);
// Show((const T*)kk, local_kv_head_num_ * max_seq_len * size_per_head_);
// Show((const T*)vv, local_kv_head_num_ * max_seq_len * size_per_head_);
if (use_fmha_) {
fusedMultiHeadAttention(tmp_k_ptrs,
fusedMultiHeadAttention(output,
q_buf_2_,
tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
cu_seqlens,
input_tensors->at("context_lengths").getPtr<int>(),
batch_size,
max_q_len,
max_k_len,
max_seq_len);
(T*)attention_mask,
(int*)cu_seqlens,
(int*)context_length,
pf_batch_size,
pf_max_q_len,
pf_max_k_len,
pf_session_len);
}
else {
unfusedMultiHeadAttention(tmp_k_ptrs,
unfusedMultiHeadAttention(output,
q_buf_2_,
tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
padding_offset,
context_length,
batch_size,
num_token,
max_q_len,
max_k_len,
max_seq_len,
pf_batch_size,
pf_num_token,
pf_max_q_len,
pf_max_k_len,
pf_session_len,
quant_policy_,
weights->past_kv_scale.data());
}
}
// Compare(qkv_buf_3_, num_token * hidden_units_, Concat("qkv_buf_3", layer_id), kCmpRead, stream_);
template<typename T>
void UnifiedAttentionLayer<T>::decode(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const int* cu_block_count,
const int* context_length,
const bool* is_finished,
const float* rope_theta,
size_t layer_offset,
int batch_size,
int dc_sum_seq_len,
int dc_max_seq_len,
int max_split_k,
const WeightType* weights)
{
DecoderMultiHeadAttentionParams<T> params{};
// dbg(max_seq_len);
params.out = output;
params.q = (T*)qkv;
params.k = params.q + local_head_num_ * size_per_head_;
params.v = params.k + local_kv_head_num_ * size_per_head_;
params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
if (0) {
Show(qkv_buf_3_, num_token * hidden_units_);
}
params.q_bias = weights->qkv.bias;
params.k_bias = params.q_bias + local_head_num_ * size_per_head_;
params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_;
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
params.batch_size = batch_size;
params.cu_block_cnts = (int*)cu_block_count;
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
params.k_cache_block_ptrs = (void**)k_cache_ptrs;
params.v_cache_block_ptrs = (void**)v_cache_ptrs;
params.kv_cache_block_size = kv_cache_block_len_;
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
params.finished = is_finished;
params.context_length = context_length;
params.rope_theta = rope_theta;
params.layer_offset = layer_offset;
params.num_heads = local_head_num_;
params.num_kv_heads = local_kv_head_num_;
params.size_per_head = size_per_head_;
params.inv_sqrt_dh = 1.f / std::sqrt((float)params.size_per_head);
params.rotary_embedding_dim = size_per_head_;
params.rotary_embedding_base = params_.rotary_embedding_base;
params.max_position_embeddings = params_.max_position_embeddings;
// params.use_dynamic_ntk = params_.use_dynamic_ntk;
params.use_logn_attn = params_.use_logn_attn;
params.partial_O = dc_workspace_;
params.partial_M = params.partial_O + batch_size * local_head_num_ * max_split_k * size_per_head_;
params.partial_L = params.partial_M + batch_size * local_head_num_ * max_split_k;
const float avg_batch_size = dc_max_seq_len ? (float)dc_sum_seq_len / dc_max_seq_len : 1;
FT_CHECK(avg_batch_size >= 1.f);
max_split_k = std::max(1, (int)std::ceil(max_split_k / avg_batch_size));
params.max_split_k = max_split_k;
params.max_seq_len = dc_max_seq_len;
params.arch = arch_;
params.stream = stream_;
params.quant_policy = quant_policy_;
FT_CHECK(std::size(weights->past_kv_scale) == std::size(params.kv_quant_params));
std::copy(weights->past_kv_scale.begin(), weights->past_kv_scale.end(), std::begin(params.kv_quant_params));
{
NvtxScope scope("decoder_multihead_attention");
DispatchDecoderMultiheadAttention<T>(params);
}
sync_check_cuda_error();
}
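The split-k heuristic in decode() can be restated on its own; the example value below simply evaluates the formula from the code: many concurrent decode sequences already supply enough parallelism, so fewer K/V splits are used per head.

#include <algorithm>
#include <cmath>

// Standalone restatement of the split-k selection in decode() above.
inline int ComputeMaxSplitK(int max_split_k, int dc_sum_seq_len, int dc_max_seq_len)
{
    const float avg_batch_size = dc_max_seq_len ? (float)dc_sum_seq_len / dc_max_seq_len : 1.f;
    return std::max(1, (int)std::ceil(max_split_k / avg_batch_size));
}
// e.g. ComputeMaxSplitK(/*max_split_k=*/16, /*dc_sum_seq_len=*/4096, /*dc_max_seq_len=*/1024) == 4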
template<typename T>
void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len)
void UnifiedAttentionLayer<T>::fusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len)
{
//////////////////////////////////////////////
// flash attention
......@@ -347,8 +497,8 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
};
size_t group_size = size_t(local_head_num_ / local_kv_head_num_);
AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
typename AttentionOp::Params attn_params{qkv_buf_3_,
q_buf_2_,
typename AttentionOp::Params attn_params{output,
(T*)query,
k_cache_buf_,
v_cache_buf_,
attention_mask,
......@@ -368,19 +518,21 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
}
template<typename T>
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
void UnifiedAttentionLayer<T>::unfusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
{
// key_cache [B, kvH, S[:t+s], D/x, x] -> [B, qH, t+s, D]
// val_cache [B, kvH, S[:t+s], D/x, x] -> [B, qH, t+s, D]
......@@ -414,7 +566,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
k_cache_buf_, // A
size_per_head_, // lda
max_k_len * size_per_head_, // strideA
q_buf_2_, // B
query, // B
size_per_head_, // ldb
max_q_len * size_per_head_, // strideB
qk_buf_, // C
......@@ -459,7 +611,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
//////////////////////////////////////////////
/// transpose <B,h,s,D> -> <B,s,h,D>
invokeTransposeAttentionOutRemovePadding(qkv_buf_2_,
qkv_buf_3_,
output,
num_token,
batch_size,
max_q_len,
......@@ -472,7 +624,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
sync_check_cuda_error();
}
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
template class UnifiedAttentionLayer<float>;
template class UnifiedAttentionLayer<half>;
} // namespace turbomind
......@@ -25,28 +25,37 @@
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaContextAttentionLayer {
class UnifiedAttentionLayer {
public:
using WeightType = LlamaAttentionWeight<T>;
static constexpr int kDecodeMaxSplits = 16;
void freeBuffer();
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
LlamaContextAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
LlamaAttentionParams attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
void allocateBuffer(size_t num_token,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
size_t dc_max_split_k);
UnifiedAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
LlamaAttentionParams attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
......@@ -65,22 +74,63 @@ public:
quant_policy_(quant_policy)
{
FT_CHECK(head_num % kv_head_num == 0);
arch_ = getSMVersion();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
void fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T** key_cache_ptrs,
void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight<T>* weights);
void prefill(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const T* attention_mask,
const int* cu_seqlens,
const int* padding_offset,
T** tmp_k_ptrs,
T** tmp_v_ptrs,
const int* input_length,
const int* context_length,
const int* cu_block_count,
const float* rope_theta,
int pf_batch_size,
int pf_num_token,
size_t layer_offset,
int pf_max_q_len,
int pf_max_k_len,
int pf_session_len,
const WeightType* weights);
void decode(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const int* cu_block_count,
const int* context_length,
const bool* is_finished,
const float* rope_theta,
size_t layer_offset,
int batch_size,
int dc_sum_seq_len,
int dc_max_seq_len,
int max_split_k,
const WeightType* weights);
void fusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
......@@ -116,6 +166,8 @@ private:
cublasMMWrapper* cublas_wrapper_;
LlamaLinear<T> linear_;
int arch_{};
T* qkv_buf_{};
T* q_buf_2_{};
T* k_buf_2_{};
......@@ -126,6 +178,7 @@ private:
float* qk_buf_float_{};
T* qkv_buf_2_{};
T* qkv_buf_3_{};
float* dc_workspace_{};
bool is_allocate_buffer_ = false;
};
......
#include "src/turbomind/models/llama/unified_decoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/unified_attention_layer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
template<typename T>
void UnifiedDecoder<T>::allocateBuffer(size_t num_token, size_t pf_batch_size, size_t pf_max_q_len, size_t pf_max_k_len)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (pf_batch_size) {
attention_mask_ =
(T*)allocator_->reMalloc(attention_mask_, sizeof(T) * pf_batch_size * pf_max_q_len * pf_max_k_len, false);
padding_offset_ =
(int*)allocator_->reMalloc(padding_offset_, sizeof(int) * pf_batch_size * pf_max_q_len, false);
cu_seqlens_ = (int*)allocator_->reMalloc(cu_seqlens_, sizeof(int) * (pf_batch_size + 1), false);
}
}
template<typename T>
void UnifiedDecoder<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)&padding_offset_);
allocator_->free((void**)&cu_seqlens_);
allocator_->free((void**)&attention_mask_);
allocator_->free((void**)&h_pinned_token_num_ptr_, true);
}
template<typename T>
void UnifiedDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
attn_layer_ = new UnifiedAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_fmha,
cache_block_seq_len,
quant_policy);
ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void UnifiedDecoder<T>::forwardSelfAttn(T* attn_io,
TensorMap* _outputs,
const TensorMap* _inputs,
size_t token_num,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
int layer_id,
const LlamaAttentionWeight<T>* weight)
{
TensorMap inputs(*_inputs);
inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io});
inputs.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id});
if (pf_batch_size) {
inputs.insert("attention_mask",
{MEMORY_GPU, dtype_, {pf_batch_size, 1, pf_max_q_len, pf_max_k_len}, attention_mask_});
const size_t pf_token_num = token_num - dc_batch_size;
inputs.insert("padding_offset", {MEMORY_GPU, TYPE_INT32, {pf_token_num}, padding_offset_});
inputs.insert("cu_seqlens", {MEMORY_GPU, TYPE_INT32, {pf_batch_size + 1}, cu_seqlens_});
}
TensorMap outputs(*_outputs);
outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io});
attn_layer_->forward(&outputs, &inputs, weight);
}
template<typename T>
UnifiedDecoder<T>::~UnifiedDecoder()
{
delete attn_layer_;
delete ffn_layer_;
freeBuffer();
}
template<typename T>
void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, const std::vector<WeightType*>* weights)
{
/**
* input tensors:
* \param decoder_input [num_token, hidden_units], float
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param output_norm_weight [hidden_dims], float
* \param max_q_len [1], int on cpu
* \param max_kv_len [1], int on cpu
* \param max_seq_len [1], int on cpu
*
* output tensors:
* \param decoder_output [num_token, hidden_units],
* \param key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x]
* \param value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head]
* \param last_token_hidden_units [batch_size, hidden_units]
*/
// Session sess{};
const size_t token_num = inputs->at("decoder_input").shape[0];
const int pf_max_q_len = inputs->getVal<int>("pf_max_q_len");
const int pf_max_k_len = inputs->getVal<int>("pf_max_k_len");
const int pf_batch_size = inputs->getVal<int>("pf_batch_size");
const int dc_batch_size = inputs->getVal<int>("dc_batch_size");
const int* input_length = inputs->getPtr<int>("input_lengths");
const int* context_length = inputs->getPtr<int>("context_lengths");
T* decoder_input_output = inputs->getPtr<T>("decoder_input");
T* decoder_output = outputs->getPtr<T>("decoder_output");
T* last_token_hidden_units = outputs->getPtr<T>("last_token_hidden_units");
allocateBuffer(token_num, pf_batch_size, pf_max_q_len, pf_max_k_len);
const int pf_offset = dc_batch_size;
if (pf_batch_size) {
FT_CHECK(padding_offset_);
size_t tmp_token_num{};
// `cu_seqlens` is the exclusive prefix sum of `input_lengths`
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
padding_offset_,
cu_seqlens_,
input_length + pf_offset,
pf_batch_size,
pf_max_q_len,
stream_);
sync_check_cuda_error();
FT_CHECK(tmp_token_num == token_num - dc_batch_size);
invokeCreateCausalMasks(attention_mask_,
input_length + pf_offset,
context_length + pf_offset,
pf_max_q_len,
pf_max_k_len,
pf_batch_size,
stream_);
sync_check_cuda_error();
}
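For reference, the cu_seqlens layout produced above can be reproduced on the host: an exclusive prefix sum over the prefill input lengths with the total token count appended (the buffer holds pf_batch_size + 1 entries, as the allocation suggests). For example, input_lengths = {3, 4, 2} gives cu_seqlens = {0, 3, 7, 9}.

#include <numeric>
#include <vector>

// Host-side illustration of the cu_seqlens layout (sketch only).
inline std::vector<int> MakeCuSeqLens(const std::vector<int>& input_lengths)
{
    std::vector<int> cu(input_lengths.size() + 1, 0);
    std::partial_sum(input_lengths.begin(), input_lengths.end(), cu.begin() + 1);
    return cu;
}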
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input_output,
weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(decoder_output,
outputs,
inputs,
token_num,
pf_batch_size,
pf_max_q_len,
pf_max_k_len,
dc_batch_size,
layer,
&weights->at(layer)->self_attn_weights);
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
weights->at(layer)->self_attn_weights.output.bias,
weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
////////////////////////////////////////////
/// feed-forward network
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}};
ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights);
const bool is_last_layer = layer == num_layer_ - 1;
auto scale_weight = !is_last_layer ? weights->at(layer + 1)->self_attn_norm_weights :
inputs->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (dc_batch_size) {
check_cuda_error(cudaMemcpyAsync(last_token_hidden_units,
decoder_output,
sizeof(T) * dc_batch_size * hidden_units_,
cudaMemcpyDefault,
stream_));
}
if (pf_batch_size) {
invokeGetFeatureOfLastToken(last_token_hidden_units + pf_offset * hidden_units_, //
decoder_output + pf_offset * hidden_units_,
cu_seqlens_,
hidden_units_,
pf_batch_size,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class UnifiedDecoder<float>;
template class UnifiedDecoder<half>;
} // namespace turbomind
#pragma once
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/unified_attention_layer.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class UnifiedDecoder {
protected:
void allocateBuffer(size_t num_token, size_t pfill_batch_size, size_t pfill_max_q_len, size_t pfill_max_k_len);
void freeBuffer();
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
cudaStream_t stream_;
cublasMMWrapper* cublas_wrapper_;
IAllocator* allocator_;
bool is_free_buffer_after_forward_{};
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
T* attention_mask_{};
int* padding_offset_{};
int* cu_seqlens_{}; // cu for cumulative
size_t* h_pinned_token_num_ptr_{};
UnifiedAttentionLayer<T>* attn_layer_{};
LlamaFfnLayer<T>* ffn_layer_{};
const DataType dtype_;
using WeightType = LlamaDecoderLayerWeight<T>;
void forwardSelfAttn(T* attn_io,
TensorMap* _outputs,
const TensorMap* _inputs,
size_t token_num,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
int layer_id,
const LlamaAttentionWeight<T>* weight);
public:
UnifiedDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
stream_(stream),
cublas_wrapper_(cublas_wrapper),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
dtype_(getTensorType<T>())
{
initialize(attn_params, kv_head_num, use_fmha, cache_block_seq_len, quant_policy);
}
~UnifiedDecoder();
void forward(TensorMap* outputs, const TensorMap* inputs, const std::vector<WeightType*>* weights);
};
} // namespace turbomind
......@@ -64,36 +64,42 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `kv_head_num` is not set, default to `head_num` (%d).", (int)kv_head_num_);
}
if (!max_batch_size_) {
max_batch_size_ = 64;
TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
if (!attn_params_.max_position_embeddings) {
attn_params_.max_position_embeddings = 2048;
TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to %d.",
(int)attn_params_.max_position_embeddings);
}
if (!session_len_) {
session_len_ = 2160;
TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
if (!engine_params_.max_batch_size) {
engine_params_.max_batch_size = 64;
TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.",
(int)engine_params_.max_batch_size);
}
if (!attn_params_.max_position_embeddings) {
attn_params_.max_position_embeddings = session_len_;
TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to `session_len` (%d).",
(int)attn_params_.max_position_embeddings);
if (!engine_params_.session_len) {
engine_params_.session_len = attn_params_.max_position_embeddings;
TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)engine_params_.session_len);
}
if (!max_context_token_num_) {
max_context_token_num_ = (int)std::sqrt(max_batch_size_);
if (!engine_params_.max_context_token_num) {
engine_params_.max_context_token_num = engine_params_.session_len;
TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
(int)max_context_token_num_);
(int)engine_params_.max_context_token_num);
}
if (!step_length_) {
step_length_ = 1;
TM_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
if (engine_params_.max_context_token_num <= engine_params_.max_batch_size) {
engine_params_.max_context_token_num *= engine_params_.session_len;
TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` = %d.", (int)engine_params_.max_context_token_num);
}
if (!cache_max_block_count_) {
cache_max_block_count_ = .95f;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %f.", cache_max_block_count_);
if (!engine_params_.step_length) {
engine_params_.step_length = 1;
}
if (!engine_params_.cache_max_block_count) {
engine_params_.cache_max_block_count = .95f;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %f.",
engine_params_.cache_max_block_count);
}
if (!cache_block_seq_len_) {
......@@ -101,9 +107,16 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `cache_block_seq_len` is not set, default to %d.", cache_block_seq_len_);
}
if (!cache_chunk_size_) {
cache_chunk_size_ = cache_max_block_count_;
TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
if (!engine_params_.cache_chunk_size) {
engine_params_.cache_chunk_size = engine_params_.cache_max_block_count;
TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.",
(int)engine_params_.cache_chunk_size);
}
if (!engine_params_.num_tokens_per_iter) {
engine_params_.num_tokens_per_iter = engine_params_.max_context_token_num;
TM_LOG_WARNING("[LlamaTritonModel] `num_tokens_per_iter` is not set, default to `max_context_token_num` (%d).",
(int)engine_params_.num_tokens_per_iter);
}
}
......@@ -142,24 +155,18 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
}
}
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
kv_head_num_ = reader.GetInteger("llama", "kv_head_num", 0);
size_per_head_ = reader.GetInteger("llama", "size_per_head");
inter_size_ = reader.GetInteger("llama", "inter_size");
num_layer_ = reader.GetInteger("llama", "num_layer");
vocab_size_ = reader.GetInteger("llama", "vocab_size");
norm_eps_ = reader.GetFloat("llama", "norm_eps");
start_id_ = reader.GetInteger("llama", "start_id");
end_id_ = reader.GetInteger("llama", "end_id");
max_batch_size_ = reader.GetInteger("llama", "max_batch_size", 0);
max_context_token_num_ = reader.GetInteger("llama", "max_context_token_num", 0);
session_len_ = reader.GetInteger("llama", "session_len", 0);
step_length_ = reader.GetInteger("llama", "step_length", 0);
cache_max_block_count_ = reader.GetFloat("llama", "cache_max_entry_count", 0);
cache_block_seq_len_ = reader.GetInteger("llama", "cache_block_seq_len", 0);
cache_chunk_size_ = reader.GetInteger("llama", "cache_chunk_size", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
kv_head_num_ = reader.GetInteger("llama", "kv_head_num", 0);
size_per_head_ = reader.GetInteger("llama", "size_per_head");
inter_size_ = reader.GetInteger("llama", "inter_size");
num_layer_ = reader.GetInteger("llama", "num_layer");
vocab_size_ = reader.GetInteger("llama", "vocab_size");
norm_eps_ = reader.GetFloat("llama", "norm_eps");
start_id_ = reader.GetInteger("llama", "start_id");
end_id_ = reader.GetInteger("llama", "end_id");
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
cache_block_seq_len_ = reader.GetInteger("llama", "cache_block_seq_len", 0);
attn_bias_ = reader.GetInteger("llama", "attn_bias", 0);
quant_policy_ = reader.GetInteger("llama", "quant_policy", 0);
......@@ -173,11 +180,19 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
// attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
attn_params_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0);
handleMissingParams();
engine_params_.max_batch_size = reader.GetInteger("llama", "max_batch_size", 0);
engine_params_.max_context_token_num = reader.GetInteger("llama", "max_context_token_num", 0);
engine_params_.session_len = reader.GetInteger("llama", "session_len", 0);
engine_params_.step_length = reader.GetInteger("llama", "step_length", 0);
if (max_context_token_num_ <= max_batch_size_) {
max_context_token_num_ *= session_len_;
}
engine_params_.cache_max_block_count = reader.GetFloat("llama", "cache_max_entry_count", 0);
engine_params_.cache_chunk_size = reader.GetInteger("llama", "cache_chunk_size", 0);
engine_params_.num_tokens_per_iter = reader.GetInteger("llama", "num_tokens_per_iter", 0);
engine_params_.extra_tokens_per_iter = reader.GetInteger("llama", "extra_tokens_per_iter", 0);
engine_params_.max_prefill_iters = reader.GetInteger("llama", "max_prefill_iters", 1);
handleMissingParams();
shared_state_ = std::make_shared<typename ft::LlamaV2<T>::SharedState>();
shared_state_->barrier = std::make_shared<ft::Barrier>(tensor_para_size);
......@@ -258,19 +273,14 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
inter_size_,
num_layer_,
vocab_size_,
attn_params_,
norm_eps_,
max_batch_size_,
max_context_token_num_,
session_len_,
step_length_,
attn_params_,
start_id_,
end_id_,
cache_max_block_count_,
cache_block_seq_len_,
cache_chunk_size_,
quant_policy_,
use_context_fmha_,
engine_params_,
shared_state_,
shared_weights_[device_id].get(),
tensor_para,
......@@ -288,7 +298,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
std::move(cuda_device_prop_ptr),
shared_weights_[device_id],
std::move(llama),
session_len_});
engine_params_.session_len});
}
template<typename T>
......@@ -367,10 +377,11 @@ std::string LlamaTritonModel<T>::toString()
ss << "Model: "
<< "\nhead_num: " << head_num_ << "\nkv_head_num: " << kv_head_num_ << "\nsize_per_head: " << size_per_head_
<< "\ninter_size: " << inter_size_ << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_
<< "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << max_batch_size_
<< "\nmax_context_token_num: " << max_context_token_num_ << "\nsession_len: " << session_len_
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_block_count_
<< "\ncache_block_seq_len: " << cache_block_seq_len_ << "\ncache_chunk_size: " << cache_chunk_size_
<< "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_params_.max_batch_size
<< "\nmax_context_token_num: " << engine_params_.max_context_token_num
<< "\nsession_len: " << engine_params_.session_len << "\nstep_length: " << engine_params_.step_length
<< "\ncache_max_entry_count: " << engine_params_.cache_max_block_count
<< "\ncache_block_seq_len: " << cache_block_seq_len_ << "\ncache_chunk_size: " << engine_params_.cache_chunk_size
<< "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_
<< "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_
<< "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_
......
......@@ -89,16 +89,11 @@ private:
size_t num_layer_;
size_t vocab_size_;
turbomind::LlamaAttentionParams attn_params_;
turbomind::EngineParams engine_params_;
float norm_eps_;
int max_batch_size_;
int max_context_token_num_;
int session_len_;
int step_length_;
int start_id_;
int end_id_;
float cache_max_block_count_;
int cache_block_seq_len_;
int cache_chunk_size_;
int use_context_fmha_;
size_t tensor_para_size_;
size_t pipeline_para_size_;
......
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include <utility>
namespace turbomind {
namespace detail {
template<int X>
inline constexpr std::integral_constant<int, X> _Int{};
template<class F, class P, class G, int... Xs, std::size_t... Is>
bool dispatch_impl(F&& f, P&& p, G g, std::integer_sequence<int, Xs...>, std::index_sequence<Is...>)
{
constexpr int N = sizeof...(Xs);
return (((((P &&) p)(_Int<Xs>) || (g && Is == N - 1)) && (((F &&) f)(_Int<Xs>), 1)) || ...);
}
} // namespace detail
template<class F, class P, int... Is, class G = std::true_type>
bool dispatch(std::integer_sequence<int, Is...> seq, P&& p, F&& f, G g = {})
{
return detail::dispatch_impl((F &&) f, (P &&) p, g, seq, std::make_index_sequence<sizeof...(Is)>{});
}
template<class F, int... Is, class G = std::true_type>
bool dispatch(std::integer_sequence<int, Is...> seq, F&& f)
{
return (((F &&) f)(detail::_Int<Is>) || ...);
}
} // namespace turbomind
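A usage sketch for the dispatch utility: select the smallest compile-time constant that covers a runtime value, the same pattern the copy kernels above rely on. The kernel named in the comment is a placeholder.

#include <utility>

#include "src/turbomind/utils/dispatch.h"

void LaunchForCount(int count)
{
    turbomind::dispatch(
        std::integer_sequence<int, 4, 8, 16, 32>{},
        [&](auto C) { return count <= C; },  // predicate: first constant that fits
        [&](auto C) {
            constexpr int kMax = decltype(C)::value;  // usable as a template argument
            (void)kMax;                               // e.g. SomeKernel<kMax><<<...>>>(...)
        });
    // With the default fallback argument (std::true_type), the last constant is
    // used when no predicate matches. The two-argument overload instead invokes
    // the functor on each constant until it returns true.
}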