Unverified Commit 7f943a26 authored by Li Zhang, committed by GitHub

Unify prefill & decode passes (#775)

* Unify prefill and decode passes

* dynamic split-fuse

* refactor

* correct input count calculation

* remove unused

* lint

* lint

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build
parent 2ba90822
......@@ -13,8 +13,8 @@
namespace turbomind {
struct Request {
uint64_t id;
uint64_t priority;
uint64_t id; // sequence id
uint64_t unique_id; // monotonically increasing
bool start_flag;
bool end_flag;
......
......@@ -36,7 +36,7 @@ SequenceManager::SequenceManager(size_t layer_num,
const Sequence* SequenceManager::Create(uint64_t id)
{
Sequence sequence{id, {}, {}, {}, {}, {}, {}, 0.f};
Sequence sequence{id};
auto it = sequences_.find(id);
if (it != sequences_.end()) {
......@@ -152,18 +152,23 @@ struct Schedule {
int last;
int input_count1;
int input_count2;
Sequences active;
std::vector<int> block_counts;
Sequences inactive;
Sequences victims;
Schedule(Snapshot snapshot, int size):
Schedule(Snapshot snapshot, int size, int _input_count1, int _input_count2):
free(snapshot.free),
cached(snapshot.cached),
last(size),
use_count_(std::move(snapshot.use_count)),
unlocked_(size),
it_(size)
it_(size),
input_count1(_input_count1),
input_count2(_input_count2)
{
}
......@@ -208,6 +213,7 @@ std::ostream& operator<<(std::ostream& os, const Schedule& s)
struct Transaction {
int index_;
int block_count_;
int input_count_;
int allocate_{};
int evict_{};
......@@ -218,44 +224,45 @@ struct Transaction {
const Sequences& sequences_;
Schedule& schedule_;
explicit Transaction(const Sequences& sequences, int index, int block_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count)
explicit Transaction(const Sequences& sequences, int index, int block_count, int input_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count), input_count_(input_count)
{
}
void Process()
{
int count = block_count_;
if (schedule_.input_count1 > 0) {
int count = block_count_;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
}
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
if (count == 0) {
return Commit();
}
}
if (count == 0) {
Commit();
}
else {
schedule_.inactive.push_back(sequences_[index_]);
}
const_cast<Sequence*>(sequences_[index_])->input_length = 0;
schedule_.inactive.push_back(sequences_[index_]);
}
void Commit()
......@@ -276,6 +283,13 @@ struct Transaction {
// update active sequences
schedule_.active.push_back(sequences_[index_]);
schedule_.block_counts.push_back(block_count_);
if (input_count_ > schedule_.input_count2) {
input_count_ = schedule_.input_count1;
}
schedule_.input_count1 -= input_count_;
schedule_.input_count2 -= input_count_;
const_cast<Sequence*>(sequences_[index_])->input_length = input_count_;
}
};
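Reading the budget bookkeeping in `Commit()` above: `input_count1` is the remaining soft token budget handed out to chunked prefills, and `input_count2` is the remaining hard cap under which a sequence is still taken whole. With illustrative budgets (512, 640) and three sequences needing 100, 300 and 600 input tokens, the first two fit under the cap and keep their full input, leaving (112, 240); the third exceeds the remaining cap of 240, so its `input_length` is chunked down to the remaining soft budget of 112 and the budgets drop to (0, 128). This is the dynamic split-fuse behaviour referenced in the commit message.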
......@@ -308,6 +322,25 @@ void SequenceManager::SortByPriority(Sequences& sequences,
context_lengths.swap(tmp_lengths);
}
// template<class P, class... Ts>
// void SortByPriority(const std::vector<P>& priorities, Ts&... ranges)
// {
// // sort according to priority
// std::vector<int> idxs(priorities.size());
// std::iota(idxs.begin(), idxs.end(), 0);
// std::sort(idxs.begin(), idxs.end(), [&](int i, int j) {
// return priorities[i] < priorities[j]; //
// });
// auto reorder = [&](auto& src) {
// auto dst = src;
// for (size_t i = 0; i < idxs.size(); ++i) {
// dst[i] = src[idxs[i]];
// }
// src.swap(dst);
// };
// (reorder(ranges), ...);
// }
std::vector<int> SequenceManager::CountRequiredBlocks(const Sequences& sequences,
const std::vector<int>& context_lengths,
int step_length)
......@@ -344,7 +377,8 @@ void SequenceManager::AssignAndActivate(const Sequences& sequenc
auto SequenceManager::Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length) -> Outcome
int step_length,
AdjustInputCount adjust) -> Outcome
{
////////////////////////////////////////////////////////////////////////////////
/// Schedule the assignment of blocks to sequences
......@@ -354,18 +388,23 @@ auto SequenceManager::Materialize(Sequences sequences,
SortByPriority(sequences, context_lengths, priorities);
// SortByPriority(priorities, sequences, context_lengths);
// Verify and lock cached sequences to avoid their blocks being evicted unnoticed
// the blocks can still be preempted later
VerifyAndLockCached(sequences);
auto [input_count1, input_count2] = adjust(sequences, context_lengths);
std::vector<int> required = CountRequiredBlocks(sequences, context_lengths, step_length);
// dbg(required);
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size());
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size(), input_count1, input_count2);
// `schedule.last` is decreasing in the loop
for (int i = 0; i < schedule.last; ++i) {
Transaction{sequences, i, required[i], schedule}.Process();
const int input_length = context_lengths[i] - sequences[i]->cache_len;
Transaction{sequences, i, required[i], input_length, schedule}.Process();
}
// mark remaining sequences invalid
......
......@@ -3,6 +3,7 @@
#pragma once
#include "src/turbomind/models/llama/BlockManager.h"
#include <functional>
namespace turbomind {
......@@ -16,19 +17,23 @@ struct Sequence {
};
uint64_t id;
Status status;
Status status = kCached;
std::vector<const Block*> blocks;
std::vector<uint64_t> block_unique_ids;
int input_length = 0;
mutable std::vector<int> tokens; // update by user
mutable int cache_len;
mutable int cache_len = 0;
// additional data kept round-to-round
mutable std::vector<std::byte> random_state; // update by user
mutable float rope_theta;
mutable float rope_theta = 0.f;
Sequence(uint64_t _id): id(_id) {}
friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};
......@@ -74,10 +79,13 @@ public:
int swap_out;
};
using AdjustInputCount = std::function<std::pair<int, int>(const Sequences&, const std::vector<int>&)>;
[[nodiscard]] Outcome Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length);
int step_length,
AdjustInputCount adjust);
void* OffsetKey(void* block_ptr)
{
......
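For orientation, an `AdjustInputCount` callback returns the `(input_count1, input_count2)` pair consumed by `Schedule`/`Transaction` above. A minimal editor's sketch of such a callback, with fixed illustrative budgets rather than the engine's actual policy (which presumably derives them from `EngineParams`):

// Editor's sketch, not part of the diff. Budget values and their meaning
// relative to EngineParams are assumptions; only the (input_count1, input_count2)
// contract comes from the code above.
#include <utility>
#include <vector>

std::pair<int, int> ExampleAdjustInputCount(const turbomind::Sequences& sequences,
                                            const std::vector<int>&     context_lengths)
{
    (void)sequences;
    (void)context_lengths;
    const int num_tokens_per_iter   = 512;  // soft budget: tokens a chunked prefill may take this iteration
    const int extra_tokens_per_iter = 128;  // slack: a sequence under soft + slack is taken whole
    return {num_tokens_per_iter, num_tokens_per_iter + extra_tokens_per_iter};
}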
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
class BatchedCopy {
public:
template<class T, std::enable_if_t<alignof(T) <= alignof(uint32_t), int> = 0>
T* Add(const T* src, int size, T* dst)
{
src_.push_back((void*)src);
dst_.push_back((void*)dst);
size_.push_back(sizeof(T) * size);
return dst + size;
}
void Submit(cudaStream_t stream)
{
invokeBatchedCopy(src_.data(), dst_.data(), size_.data(), size_.size(), stream);
sync_check_cuda_error();
src_.clear();
dst_.clear();
size_.clear();
}
private:
std::vector<void*> src_;
std::vector<void*> dst_;
std::vector<int> size_;
};
} // namespace turbomind
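A hedged usage sketch of the helper above: `Add` records one copy and returns `dst + size`, so successive calls pack into a contiguous destination, and `Submit` flushes everything in a single `invokeBatchedCopy` launch. Buffer names and the commented header path are illustrative.

// Editor's sketch, not part of the diff.
#include <cstddef>
#include <vector>
#include <cuda_runtime.h>
// #include "...BatchedCopy header..."  // path not shown in this diff

void pack_per_sequence_ints(const std::vector<int*>& d_src,  // device pointers, one per sequence
                            const std::vector<int>&  sizes,  // element counts per sequence
                            int*                     d_dst,  // contiguous device destination
                            cudaStream_t             stream)
{
    turbomind::BatchedCopy batched;
    for (size_t i = 0; i < d_src.size(); ++i) {
        d_dst = batched.Add(d_src[i], sizes[i], d_dst);  // returns the advanced destination pointer
    }
    batched.Submit(stream);  // one batched kernel launch, then the recorded copies are cleared
}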
......@@ -101,6 +101,8 @@ __device__ T blockReduceSum(const cg::thread_block& block, T value)
return cg::reduce(tile, value, cg::plus<float>{});
}
// r' = r + x
// x' = norm(r') * scales
template<typename T>
__global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
T* __restrict__ x_data,
......
......@@ -9,11 +9,13 @@
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/dispatch.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <cstdint>
#include <cub/block/block_reduce.cuh>
#include <type_traits>
#include <utility>
namespace turbomind {
......@@ -543,8 +545,10 @@ __global__ void gatherOutput(int* output_ids,
continue;
}
// skip padding for dst
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
if (dst_idx < max_output_len) {
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
}
}
}
......@@ -694,50 +698,31 @@ void invokeIndexedCopyImpl(void** h_src_ptr,
int count,
cudaStream_t st)
{
auto invoke = [&](auto max_count) {
constexpr int C = decltype(max_count)::value;
// maximum parameter size: sm<70: 4kB, sm>=70: 32kB
static_assert(sizeof(IndexedCopyParam<N, C>) <= 4096);
IndexedCopyParam<N, C> param{};
std::copy_n(h_src_ptr, N, param.src_ptr.data());
std::copy_n(h_dst_ptr, N, param.dst_ptr.data());
std::transform(h_elem_sz, h_elem_sz + N, param.stride.data(), [](int size) {
// Basic alignment check
FT_CHECK_WITH_INFO(size % sizeof(T) == 0, fmtstr("misalignment: %d %% %d", size, (int)sizeof(T)));
return size / sizeof(T);
dispatch( // dispatch for num of copy operations
std::integer_sequence<int, 4, 8, 16, 32, 64, 128, 256>{},
[&](auto C) { return count <= C; },
[&](auto C) {
// maximum parameter size: sm<70: 4kB, sm>=70: 32kB
static_assert(sizeof(IndexedCopyParam<N, C>) <= 4096);
IndexedCopyParam<N, C> param{};
std::copy_n(h_src_ptr, N, param.src_ptr.data());
std::copy_n(h_dst_ptr, N, param.dst_ptr.data());
std::transform(h_elem_sz, h_elem_sz + N, param.stride.data(), [](int size) {
// Basic alignment check
FT_CHECK_WITH_INFO(size % sizeof(T) == 0, fmtstr("misalignment: %d %% %d", size, (int)sizeof(T)));
return size / sizeof(T);
});
param.max_stride = *std::max_element(param.stride.begin(), param.stride.end());
auto copy_idx = [](const int* src, int offset, int n, auto dst) {
return src ? (void)std::copy_n(src + offset, n, dst) : std::iota(dst, dst + n, offset);
};
for (int c = 0; c < count; c += C) {
int batch_size = std::min(count - c, (int)C);
copy_idx(h_src_idx, c, batch_size, param.src_idx.data());
copy_idx(h_dst_idx, c, batch_size, param.dst_idx.data());
indexedCopy<T><<<batch_size, 128, 0, st>>>(param);
}
});
param.max_stride = *std::max_element(param.stride.begin(), param.stride.end());
auto copy_idx = [](const int* src, int offset, int n, auto dst) {
return src ? (void)std::copy_n(src + offset, n, dst) : std::iota(dst, dst + n, offset);
};
for (int c = 0; c < count; c += C) {
int batch_size = std::min(count - c, C);
copy_idx(h_src_idx, c, batch_size, param.src_idx.data());
copy_idx(h_dst_idx, c, batch_size, param.dst_idx.data());
indexedCopy<T><<<batch_size, 128, 0, st>>>(param);
}
};
if (count <= 4) {
invoke(std::integral_constant<int, 4>{});
}
if (count <= 8) {
invoke(std::integral_constant<int, 8>{});
}
else if (count <= 16) {
invoke(std::integral_constant<int, 16>{});
}
else if (count <= 32) {
invoke(std::integral_constant<int, 32>{});
}
else if (count <= 64) {
invoke(std::integral_constant<int, 64>{});
}
else if (count <= 128) {
invoke(std::integral_constant<int, 128>{});
}
else {
invoke(std::integral_constant<int, 256>{});
}
}
void invokeIndexedCopy(void** h_src_ptr,
......@@ -749,19 +734,14 @@ void invokeIndexedCopy(void** h_src_ptr,
int n_copys,
cudaStream_t st)
{
auto args = std::tuple{h_src_ptr, h_dst_ptr, h_elem_sz, h_src_idx, h_dst_idx, count, st};
switch (n_copys) {
case 1:
return std::apply(invokeIndexedCopyImpl<uint32_t, 1>, args);
case 2:
return std::apply(invokeIndexedCopyImpl<uint32_t, 2>, args);
case 3:
return std::apply(invokeIndexedCopyImpl<uint32_t, 3>, args);
case 4:
return std::apply(invokeIndexedCopyImpl<uint32_t, 4>, args);
default:
FT_CHECK(0);
}
auto success = dispatch(std::integer_sequence<int, 1, 2, 3, 4>{}, [&](auto N) {
if (N == n_copys) {
invokeIndexedCopyImpl<uint32_t, N>(h_src_ptr, h_dst_ptr, h_elem_sz, h_src_idx, h_dst_idx, count, st);
return true;
}
return false;
});
FT_CHECK(success);
}
__global__ void padLastTokenIds(int* token_ids, const int* context_length, int max_context_len, int batch_size)
......@@ -777,6 +757,96 @@ void invokePadLastTokenIds(
padLastTokenIds<<<1, 512, 0, stream>>>(token_ids, context_length, max_context_len, batch_size);
}
template<typename T>
__global__ void getFeatureOfLastToken(T* output, const T* input, const int* cu_seqlens, int dims)
{
int bi = blockIdx.x;
int ti = cu_seqlens[bi + 1] - 1;
for (int i = threadIdx.x; i < dims; i += blockDim.x) {
output[dims * bi + i] = input[dims * ti + i];
}
}
template<typename T>
void invokeGetFeatureOfLastToken(
T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream)
{
getFeatureOfLastToken<<<batch_size, 256, 0, stream>>>(output, input, cu_seqlens, dims);
}
template void invokeGetFeatureOfLastToken(half*, const half*, const int*, int, int, cudaStream_t);
template void invokeGetFeatureOfLastToken(float*, const float*, const int*, int, int, cudaStream_t);
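The kernel above launches one block per sequence and copies row `cu_seqlens[bi + 1] - 1` of the packed `[token_num, dims]` input into row `bi` of the output. A hedged host-side usage sketch (variable names are illustrative):

// Editor's sketch, not part of the diff.
#include <cuda_fp16.h>
#include <cuda_runtime.h>

void gather_last_token_features(half*        d_last,        // [batch_size, dims]
                                const half*  d_hidden,      // [sum(seq_lens), dims], packed
                                const int*   d_cu_seqlens,  // [batch_size + 1] prefix sums of seq_lens
                                int          dims,
                                int          batch_size,
                                cudaStream_t stream)
{
    turbomind::invokeGetFeatureOfLastToken(d_last, d_hidden, d_cu_seqlens, dims, batch_size, stream);
}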
template<class T, int C>
struct BatchedCopyParam {
Array<T*, C> src_ptr;
Array<T*, C> dst_ptr;
Array<int, C> size;
int count;
};
template<int kThrPerCpy, class T, int C>
__global__ void batchedCopy(BatchedCopyParam<T, C> param)
{
const int ti = threadIdx.x + blockIdx.x * blockDim.x;
const int bi = ti / kThrPerCpy;
if (bi >= param.count) {
return;
}
const T* __restrict__ src = param.src_ptr[bi];
T* __restrict__ dst = param.dst_ptr[bi];
int size = param.size[bi];
for (int i = ti % kThrPerCpy; i < size; i += kThrPerCpy) {
dst[i] = src[i];
}
}
// MSVC does not like CUDA kernel launch inside nested lambdas
template<class P>
struct BatchedCopyLauncher {
int max_size;
int count;
const P* params;
cudaStream_t st;
template<int S>
void operator()(std::integral_constant<int, S>) const
{
constexpr int threads = 128;
constexpr int items_per_block = threads / S;
const int blocks = (count + items_per_block - 1) / items_per_block;
batchedCopy<S><<<blocks, threads, 0, st>>>(*params);
}
};
void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cudaStream_t st)
{
dispatch(
std::integer_sequence<int, 1, 8, 32, 128>{},
[&](auto C) { return count <= C; },
[&](auto C) {
using T = uint32_t;
BatchedCopyParam<T, C> params{};
// TODO: on CUDA 12.1 and sm_70+ this can be 32K
static_assert(sizeof(params) <= 4096);
for (int c = 0; c < count; c += C) {
const int bsz = std::min<int>(count - c, C);
params.count = bsz;
for (int i = 0; i < bsz; ++i) {
params.src_ptr[i] = (T*)src_ptr[c + i];
params.dst_ptr[i] = (T*)dst_ptr[c + i];
FT_CHECK(size[c + i] % sizeof(T) == 0);
params.size[i] = size[c + i] / sizeof(T);
}
const int max_size = *std::max_element(params.size.begin(), params.size.end());
dispatch(
std::integer_sequence<int, 1, 2, 4, 8, 16, 32, 64, 128>{},
[&](auto S) { return max_size <= S; },
BatchedCopyLauncher<BatchedCopyParam<T, C>>{max_size, count, &params, st});
}
});
}
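To make the two-level dispatch above concrete (illustrative numbers): for count = 20 copies whose largest size is 40 uint32_t words, the outer dispatch picks the batch capacity C = 32 and the inner dispatch picks S = 64 threads per copy; with 128 threads per block that is 2 copies per block, so `BatchedCopyLauncher` issues ceil(20 / 2) = 10 blocks, each participating thread striding over its copy in steps of 64.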
#define VERSION_SWITCH(VERSION, CONST_NAME, ...) \
[&] { \
if (VERSION == 2) { \
......
......@@ -105,6 +105,8 @@ void invokeIndexedCopy(void** h_src_ptr,
int n_copys,
cudaStream_t st);
void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cudaStream_t st);
// ABCDe ABCDe e
// ABCDEFGHIJk ABCDEFGHIJk
// ABCDEFGHi -> ABCDEFGHi i
......@@ -113,6 +115,10 @@ void invokeIndexedCopy(void** h_src_ptr,
void invokePadLastTokenIds(
int* token_ids, const int* context_length, int max_context_len, int batch_size, cudaStream_t stream);
template<typename T>
void invokeGetFeatureOfLastToken(
T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream);
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
template<typename T>
......
......@@ -13,4 +13,21 @@ struct LlamaAttentionParams {
bool use_logn_attn;
};
struct EngineParams {
// batch params
int max_batch_size;
int session_len;
int step_length;
// cache params
float cache_max_block_count;
int cache_chunk_size;
// chunking params
int max_context_token_num;
int num_tokens_per_iter;
int extra_tokens_per_iter;
int max_prefill_iters;
};
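An editor's reading of the new chunking fields, not stated in this diff: `num_tokens_per_iter` is the per-iteration prefill token budget, `extra_tokens_per_iter` is extra slack below which a prompt is still taken in one piece rather than chunked, and `max_prefill_iters` bounds how many iterations a single prompt's prefill may be spread over, effectively raising the chunk size for very long prompts. These would map onto the `(input_count1, input_count2)` pair passed to `SequenceManager::Materialize`.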
} // namespace turbomind
......@@ -25,28 +25,37 @@
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaContextAttentionLayer {
class UnifiedAttentionLayer {
public:
using WeightType = LlamaAttentionWeight<T>;
static constexpr int kDecodeMaxSplits = 16;
void freeBuffer();
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
LlamaContextAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
LlamaAttentionParams attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
void allocateBuffer(size_t num_token,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
size_t dc_max_split_k);
UnifiedAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
LlamaAttentionParams attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
......@@ -65,22 +74,63 @@ public:
quant_policy_(quant_policy)
{
FT_CHECK(head_num % kv_head_num == 0);
arch_ = getSMVersion();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
void fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T** key_cache_ptrs,
void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight<T>* weights);
void prefill(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const T* attention_mask,
const int* cu_seqlens,
const int* padding_offset,
T** tmp_k_ptrs,
T** tmp_v_ptrs,
const int* input_length,
const int* context_length,
const int* cu_block_count,
const float* rope_theta,
int pf_batch_size,
int pf_num_token,
size_t layer_offset,
int pf_max_q_len,
int pf_max_k_len,
int pf_session_len,
const WeightType* weights);
void decode(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const int* cu_block_count,
const int* context_length,
const bool* is_finished,
const float* rope_theta,
size_t layer_offset,
int batch_size,
int dc_sum_seq_len,
int dc_max_seq_len,
int max_split_k,
const WeightType* weights);
void fusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
......@@ -116,6 +166,8 @@ private:
cublasMMWrapper* cublas_wrapper_;
LlamaLinear<T> linear_;
int arch_{};
T* qkv_buf_{};
T* q_buf_2_{};
T* k_buf_2_{};
......@@ -126,6 +178,7 @@ private:
float* qk_buf_float_{};
T* qkv_buf_2_{};
T* qkv_buf_3_{};
float* dc_workspace_{};
bool is_allocate_buffer_ = false;
};
......
#pragma once
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/unified_attention_layer.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class UnifiedDecoder {
protected:
void allocateBuffer(size_t num_token, size_t pfill_batch_size, size_t pfill_max_q_len, size_t pfill_max_k_len);
void freeBuffer();
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
cudaStream_t stream_;
cublasMMWrapper* cublas_wrapper_;
IAllocator* allocator_;
bool is_free_buffer_after_forward_{};
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
T* attention_mask_{};
int* padding_offset_{};
int* cu_seqlens_{}; // cu for cumulative
size_t* h_pinned_token_num_ptr_{};
UnifiedAttentionLayer<T>* attn_layer_{};
LlamaFfnLayer<T>* ffn_layer_{};
const DataType dtype_;
using WeightType = LlamaDecoderLayerWeight<T>;
void forwardSelfAttn(T* attn_io,
TensorMap* _outputs,
const TensorMap* _inputs,
size_t token_num,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
int layer_id,
const LlamaAttentionWeight<T>* weight);
public:
UnifiedDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
stream_(stream),
cublas_wrapper_(cublas_wrapper),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
dtype_(getTensorType<T>())
{
initialize(attn_params, kv_head_num, use_fmha, cache_block_seq_len, quant_policy);
}
~UnifiedDecoder();
void forward(TensorMap* outputs, const TensorMap* inputs, const std::vector<WeightType*>* weights);
};
} // namespace turbomind
......@@ -89,16 +89,11 @@ private:
size_t num_layer_;
size_t vocab_size_;
turbomind::LlamaAttentionParams attn_params_;
turbomind::EngineParams engine_params_;
float norm_eps_;
int max_batch_size_;
int max_context_token_num_;
int session_len_;
int step_length_;
int start_id_;
int end_id_;
float cache_max_block_count_;
int cache_block_seq_len_;
int cache_chunk_size_;
int use_context_fmha_;
size_t tensor_para_size_;
size_t pipeline_para_size_;
......
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include <utility>
namespace turbomind {
namespace detail {
template<int X>
inline constexpr std::integral_constant<int, X> _Int{};
template<class F, class P, class G, int... Xs, std::size_t... Is>
bool dispatch_impl(F&& f, P&& p, G g, std::integer_sequence<int, Xs...>, std::index_sequence<Is...>)
{
constexpr int N = sizeof...(Xs);
return (((((P &&) p)(_Int<Xs>) || (g && Is == N - 1)) && (((F &&) f)(_Int<Xs>), 1)) || ...);
}
} // namespace detail
template<class F, class P, int... Is, class G = std::true_type>
bool dispatch(std::integer_sequence<int, Is...> seq, P&& p, F&& f, G g = {})
{
return detail::dispatch_impl((F &&) f, (P &&) p, g, seq, std::make_index_sequence<sizeof...(Is)>{});
}
template<class F, int... Is, class G = std::true_type>
bool dispatch(std::integer_sequence<int, Is...> seq, F&& f)
{
return (((F &&) f)(detail::_Int<Is>) || ...);
}
} // namespace turbomind
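The helper above comes in two flavours: `dispatch(seq, pred, body)` invokes `body` on the first compile-time value for which `pred` is true (with the default fourth argument, the last value acts as a fallback when no predicate matches), while `dispatch(seq, f)` calls `f` on each value until it returns true and reports whether any did. A minimal usage sketch, assuming the header above is included:

// Editor's sketch, not part of the diff.
#include <cstdio>
#include <utility>

void run_with_capacity(int n)
{
    // Picks the smallest capacity in {4, 8, 16, 32} that holds n;
    // with the default fallback, 32 is used when n exceeds all of them.
    turbomind::dispatch(
        std::integer_sequence<int, 4, 8, 16, 32>{},
        [&](auto C) { return n <= C; },
        [&](auto C) {
            constexpr int capacity = decltype(C)::value;  // available as a compile-time constant
            std::printf("n = %d handled with capacity %d\n", n, capacity);
        });
}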