Unverified commit ab1767cf authored by Li Zhang, committed by GitHub

TurboMind 2 (#590)

* refresh decoder attention kernel

* block-level kv cache

* `BlockManager` & `SequenceManager`

* update

* update

* update

* update

* rename

* GQA support

* fix context length

* GQA dispatch

* kv8

* tune

* async stream cb

* nvtx

* config parsing

* debug

* optimize output cost

* split-k decoding

* minor

* truncate `session_len` by available blocks

* minor

* license

* fix

* dispatch `cp.async`

* fix linking

* fix

* fix deadlock

* guard input length

* correct start offset

* fix prefill chunking

* fix `cache_block_seq_len` param passing

* fix `block_size` fmtstr

* fix output tokens

* fix batch resizing

* fix masking of finished sequences

* add debug util

* free unused block early

* add ntk scaling and logn scaling

* cmake flags

* fix typo

* w4a16 for sm75

* fix msvc build

* fix msvc build

* fix block verification

* fix msvc build

* use `std::shuffle`

* fix lint

* fix lint

* fix lint

* clear incoming buffer

* clear finished requests

* fix batch initialization

* fix typo

* fix typo

* fix comparison
parent 06125966
......@@ -9,6 +9,23 @@
namespace turbomind {
__inline__ __device__ void
mma_m16n8k8_row_col(Array<float, 4>& d, const Array<half, 4>& a, const Array<half, 2>& b, Array<float, 4>& c)
{
#if TURBOMIND_ARCH_SM75
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a);
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b);
float const* C = reinterpret_cast<float const*>(&c);
float* D = reinterpret_cast<float*>(&d);
asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
"{%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
assert(TURBOMIND_ARCH_SM75);
#endif
}
__inline__ __device__ void
mma_m16n8k16_row_col(Array<float, 4>& d, const Array<half, 8>& a, const Array<half, 4>& b, Array<float, 4>& c)
{
......@@ -22,7 +39,10 @@ mma_m16n8k16_row_col(Array<float, 4>& d, const Array<half, 8>& a, const Array<ha
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
assert(TURBOMIND_ARCH_SM80);
const Array<half, 4>* _a = (const Array<half, 4>*)&a;
const Array<half, 2>* _b = (const Array<half, 2>*)&b;
mma_m16n8k8_row_col(d, _a[0], _b[0], c);
mma_m16n8k8_row_col(d, _a[1], _b[1], d);
#endif
}
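The SM75 branch above decomposes the m16n8k16 MMA along K into two m16n8k8 MMAs, feeding the result of the first back in as the accumulator of the second. A host-side scalar sketch of the same identity (plain C++, not part of this commit; the matrix sizes are just the MMA tile shape):

// Scalar reference: a 16x8 (M x N) output with K = 16 equals two accumulated
// K = 8 products, which is what the two chained mma_m16n8k8_row_col calls compute.
#include <cmath>
#include <cstdio>

int main()
{
    const int M = 16, N = 8, K = 16;
    float A[M][K], B[K][N], C[M][N] = {}, D[M][N] = {};
    for (int i = 0; i < M; ++i)
        for (int k = 0; k < K; ++k)
            A[i][k] = 0.01f * (i + k);
    for (int k = 0; k < K; ++k)
        for (int j = 0; j < N; ++j)
            B[k][j] = 0.02f * (k - j);
    // one K = 16 product
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < K; ++k)
                C[i][j] += A[i][k] * B[k][j];
    // two K = 8 products; the second accumulates into the first
    for (int half = 0; half < 2; ++half)
        for (int i = 0; i < M; ++i)
            for (int j = 0; j < N; ++j)
                for (int k = half * 8; k < half * 8 + 8; ++k)
                    D[i][j] += A[i][k] * B[k][j];
    float max_diff = 0.f;
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
            max_diff = std::fmax(max_diff, std::fabs(C[i][j] - D[i][j]));
    std::printf("max diff = %g\n", max_diff);  // 0 up to float rounding
}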
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/kernels/decoder_multihead_attention/array_ops.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
......@@ -854,19 +854,20 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
T* v_buf,
T* QKV,
const T* __restrict qkv_bias,
const int* padding_offset,
const int* history_length,
const int* input_length,
int batch_size,
int seq_len,
int head_num,
int kv_head_num,
int size_per_head,
int rotary_embedding_dim,
float rotary_embedding_base,
int max_position_embeddings,
bool use_dynamic_ntk,
bool use_logn_attn)
const int* padding_offset,
const int* context_length,
const int* input_length,
const float* rope_theta,
int batch_size,
int seq_len,
int head_num,
int kv_head_num,
int size_per_head,
int rotary_embedding_dim,
float rotary_embedding_base,
int max_position_embeddings,
bool use_dynamic_ntk,
bool use_logn_attn)
{
// This kernel adds bias to QKV, which has shape [batch_size, seq_len, 3, head_num, size_per_head], then
// splits QKV into three buffers q, k, v and transposes them to [batch_size, head_num, seq_len, size_per_head].
......@@ -907,12 +908,18 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
Vec_t q, k, v;
Vec_t q_bias, k_bias, v_bias;
using Vec = Array<T, vec_size>;
static_assert(sizeof(Vec_t) == sizeof(Vec));
using namespace ops;
// load Q and apply bias
if (!is_masked) {
q = *reinterpret_cast<const Vec_t*>(&QKV[src_q_idx]);
if (qkv_bias) {
q_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx]);
q = mmha::add(q, q_bias);
q_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx]);
(Vec&)q = (Vec&)q + (Vec&)q_bias;
}
}
......@@ -921,35 +928,32 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
k = *reinterpret_cast<const Vec_t*>(&QKV[src_k_idx]);
v = *reinterpret_cast<const Vec_t*>(&QKV[src_v_idx]);
if (qkv_bias) {
k_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + k_offset]);
v_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + v_offset]);
k = mmha::add(k, k_bias);
v = mmha::add(v, v_bias);
k_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + k_offset]);
v_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + v_offset]);
(Vec&)k = (Vec&)k + (Vec&)k_bias;
(Vec&)v = (Vec&)v + (Vec&)v_bias;
}
}
const int history_len = history_length[batch_idx];
const int context_len = history_len + input_length[batch_idx];
const int context_len = context_length[batch_idx];
const int history_len = context_len - input_length[batch_idx];
const int timestep = history_len + seq_idx;
if (use_dynamic_ntk) {
rotary_embedding_base = mmha::rotary_embedding_get_base(
context_len, max_position_embeddings, rotary_embedding_dim, rotary_embedding_base);
if (rope_theta) {
rotary_embedding_base = rope_theta[batch_idx];
}
// TODO: unused computation on k if GQA is used
mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rotary_embedding_base, timestep);
RotaryEmbedding<vec_size> rotary_emb(rotary_embedding_base, rotary_embedding_dim, timestep, {tidx * vec_size, 0});
rotary_emb.apply((Array<T, vec_size>&)q);
if (head_idx < kv_head_num) {
rotary_emb.apply((Array<T, vec_size>&)k);
}
if (use_logn_attn) {
// +1 to convert to context length at the timestep
float logn_scaling = mmha::logn_attn_get_scaling(timestep + 1, max_position_embeddings);
if constexpr (std::is_same_v<T, float>) {
q = mmha::mul<Vec_t, float, Vec_t>(logn_scaling, q);
}
else if constexpr (std::is_same_v<T, half>) {
half tmp = __float2half(logn_scaling);
q = mmha::mul<Vec_t, uint16_t, Vec_t>((uint16_t&)tmp, q);
}
LogNScaling logn_scaling(timestep + 1, max_position_embeddings);
logn_scaling.apply((Array<T, vec_size>&)q);
}
if (!is_masked && !q_buf) { // also skip modifying QKV if q/k/v_buf are present
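`LogNScaling` replaces the former `mmha::logn_attn_get_scaling` + per-type `mmha::mul` path, but its definition is not visible in this view. The scalar sketch below shows the usual log-n attention scaling it presumably implements (scale q by log_m(n) once the absolute position n exceeds the trained context m); the struct is an illustrative assumption, not the committed class:

// Hypothetical scalar sketch of log-n attention scaling; the real LogNScaling
// lives elsewhere in the commit and may differ.
#include <cmath>

struct LogNScalingSketch {
    float scale_;
    LogNScalingSketch(int position, int max_position_embeddings)
    {
        // scale = log_m(n) = ln(n) / ln(m), applied only beyond the trained length
        scale_ = position > max_position_embeddings ?
                     std::log(float(position)) / std::log(float(max_position_embeddings)) :
                     1.f;
    }
    void apply(float* q, int n) const
    {
        for (int i = 0; i < n; ++i) {
            q[i] *= scale_;
        }
    }
};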
......@@ -982,8 +986,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
QKV, \
qkv_bias, \
padding_offset, \
history_length, \
context_length, \
input_length, \
rope_theta, \
batch_size, \
seq_len, \
head_num, \
......@@ -1002,8 +1007,9 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int* context_length,
const int* input_length,
const float* rope_theta,
const int batch_size,
const int seq_len,
const int token_num,
......@@ -1034,6 +1040,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
const int* padding_offset, \
const int* history_length, \
const int* input_length, \
const float* rope_theta, \
const int batch_size, \
const int seq_len, \
const int token_num, \
......
......@@ -70,8 +70,9 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int* context_length,
const int* input_length,
const float* rope_theta,
const int batch_size,
const int seq_len,
const int token_num,
......
......@@ -2,6 +2,7 @@
#pragma once
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#ifndef _MSC_VER
#include <pthread.h>
......
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/BlockManager.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <iterator>
#include <stdexcept>
namespace turbomind {
BlockManager::BlockManager(size_t block_size, double block_count, int chunk_size, IAllocator* allocator):
block_size_(block_size), allocator_(allocator)
{
if (block_count < 1.) {
max_block_count_ = GetBlockCount(block_size, block_count);
}
else {
max_block_count_ = block_count;
}
if (chunk_size == 0) {
chunk_size_ = static_cast<int>(std::sqrt(max_block_count_));
}
else if (chunk_size < 0) {
chunk_size_ = max_block_count_;
}
else {
chunk_size_ = chunk_size;
}
TM_LOG_INFO("[BlockManager] block_size = %lu MB", (unsigned long)block_size_ >> 20);
TM_LOG_INFO("[BlockManager] max_block_count = %d", max_block_count_);
TM_LOG_INFO("[BlockManager] chunk_size = %d", chunk_size_);
blocks_.reserve(max_block_count_);
active_ids_.reserve(max_block_count_);
cached_ids_.reserve(max_block_count_);
free_ids_.reserve(max_block_count_);
// pre-allocate first chunk
Malloc();
dbg(free_ids_);
}
BlockManager::~BlockManager()
{
for (auto& chunk : chunks_) {
allocator_->free(&chunk);
}
}
bool BlockManager::Malloc()
{
auto chunk_size = std::min<int>(chunk_size_, max_block_count_ - blocks_.size());
if (!chunk_size) {
return false;
}
auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size);
if (!ptr) {
return false;
}
chunks_.push_back(ptr);
for (int i = 0; i < chunk_size; ++i, ptr += block_size_) {
auto& block = blocks_.emplace_back();
block.use_count = 0;
block.ref_count = 0;
block.id = (int)blocks_.size() - 1;
block.timestamp = 0;
block.data = ptr;
free_ids_.push_back(block.id);
}
return true;
}
size_t BlockManager::GetBlockCount(size_t block_size, double ratio)
{
size_t free{};
size_t total{};
check_cuda_error(cudaMemGetInfo(&free, &total));
return static_cast<size_t>(total * ratio) / block_size;
}
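Note that `GetBlockCount` budgets against total device memory rather than the currently free amount. A quick standalone check of the same arithmetic with made-up figures (80 GB total, ratio 0.5, 128 MB blocks):

#include <cstdio>

int main()
{
    const size_t total      = 80ull << 30;   // 80 GB (made-up)
    const double ratio      = 0.5;
    const size_t block_size = 128ull << 20;  // 128 MB
    std::printf("%zu blocks\n", static_cast<size_t>(total * ratio) / block_size);  // prints 320
}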
void BlockManager::Move(std::vector<int>& src, const std::vector<int>& delta, std::vector<int>& dst)
{
std::vector<int> src1(src.size() - delta.size());
std::set_difference(src.begin(), src.end(), delta.begin(), delta.end(), src1.begin());
src.swap(src1);
std::vector<int> dst1(dst.size() + delta.size());
std::set_union(dst.begin(), dst.end(), delta.begin(), delta.end(), dst1.begin());
dst.swap(dst1);
}
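`Move` relies on all three id vectors staying sorted: `std::set_difference` drops `delta` from `src` and `std::set_union` merges it into `dst`. A standalone illustration of the same two calls with made-up ids:

#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
    std::vector<int> src{0, 2, 5, 7};  // e.g. free_ids_
    std::vector<int> dst{1, 3};        // e.g. active_ids_
    std::vector<int> delta{2, 7};      // ids being moved; must be sorted and come from src

    std::vector<int> src1(src.size() - delta.size());
    std::set_difference(src.begin(), src.end(), delta.begin(), delta.end(), src1.begin());
    src.swap(src1);                    // src -> {0, 5}

    std::vector<int> dst1(dst.size() + delta.size());
    std::set_union(dst.begin(), dst.end(), delta.begin(), delta.end(), dst1.begin());
    dst.swap(dst1);                    // dst -> {1, 2, 3, 7}

    assert((src == std::vector<int>{0, 5}));
    assert((dst == std::vector<int>{1, 2, 3, 7}));
}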
std::vector<const Block*> BlockManager::Allocate(int count)
{
while (free_ids_.size() < count) {
if (!Malloc()) {
throw std::runtime_error("out of memory");
}
}
std::vector<const Block*> ret;
std::vector<int> idxs(count);
for (int i = 0; i < count; ++i) {
int idx = free_ids_[i];
idxs[i] = idx;
auto& block = blocks_[idx];
FT_CHECK(is_free(block));
block.ref_count = 1;
block.use_count = 1;
block.unique_id = unique_id_++;
ret.push_back(&block);
}
Move(free_ids_, idxs, active_ids_);
dbg(free_ids_, active_ids_);
return ret;
}
void BlockManager::Evict(int count)
{
std::vector<int> idxs(cached_ids_);
// get first `count` cached ids according to timestamp
std::nth_element(idxs.begin(), idxs.begin() + count, idxs.end(), [&](int i, int j) {
return blocks_[i].timestamp < blocks_[j].timestamp;
});
idxs.resize(count);
// sort the retrieved ids
std::sort(idxs.begin(), idxs.end());
// set as free
for (const auto& idx : idxs) {
auto& b = blocks_[idx];
FT_CHECK(is_cached(b));
b.ref_count = 0;
b.unique_id = 0;
b.timestamp = 0;
}
Move(cached_ids_, idxs, free_ids_);
dbg(cached_ids_, free_ids_);
}
int BlockManager::Free(const std::vector<const Block*>& bs)
{
std::vector<int> idxs;
for (const auto& p : bs) {
auto& b = blocks_[p->id];
FT_CHECK(is_cached(b));
if (--b.ref_count == 0) {
b.unique_id = 0;
b.timestamp = 0;
idxs.push_back(b.id);
}
}
std::sort(idxs.begin(), idxs.end());
Move(cached_ids_, idxs, free_ids_);
dbg(cached_ids_, free_ids_);
return idxs.size();
}
int BlockManager::Unlock(const std::vector<const Block*>& bs)
{
std::vector<int> idxs;
for (const auto& p : bs) {
auto& block = blocks_[p->id];
FT_CHECK(is_active(block));
if (--block.use_count == 0) {
idxs.push_back(block.id);
}
}
std::sort(idxs.begin(), idxs.end());
Move(active_ids_, idxs, cached_ids_);
dbg(active_ids_, cached_ids_);
return idxs.size();
}
int BlockManager::Lock(const std::vector<const Block*>& bs)
{
std::vector<int> idxs;
for (const auto& p : bs) {
auto& block = blocks_[p->id];
FT_CHECK(is_cached(block));
if (++block.use_count == 1) {
idxs.push_back(p->id);
}
}
std::sort(idxs.begin(), idxs.end());
Move(cached_ids_, idxs, active_ids_);
// dbg(cached_ids_, active_ids_);
return idxs.size();
}
void BlockManager::Touch(const std::vector<const Block*>& bs)
{
std::for_each(bs.crbegin(), bs.crend(), [this](const Block* p) {
FT_CHECK(is_active(*p));
const_cast<Block*>(p)->timestamp = timestamp_++;
});
}
Snapshot BlockManager::TakeSnapshot()
{
std::vector<int> use_count(blocks_.size());
for (const auto& idx : active_ids_) {
use_count[idx] = blocks_[idx].use_count;
}
return {active_count(), cached_count(), free_count(), std::move(use_count)};
}
std::ostream& operator<<(std::ostream& os, const BlockManager& manager)
{
os << "block_size: " << manager.block_size_ << ", ";
os << "max_block_count: " << manager.max_block_count_ << ", ";
os << "chunk_size: " << manager.chunk_size_ << ", ";
os << "chunks: " << manager.chunks_.size() << ", ";
os << "active_ids: " << manager.active_ids_.size() << ", ";
os << "cached_ids: " << manager.cached_ids_.size() << ", ";
os << "free_ids: " << manager.free_ids_.size() << ", ";
os << "blocks: " << manager.blocks_.size() << ", ";
os << "unique_id: " << manager.unique_id_ << ", ";
os << "timestamp: " << manager.timestamp_ << ", ";
os << "allocator: " << manager.allocator_;
return os;
}
std::ostream& operator<<(std::ostream& os, const Block& block)
{
os << "id=" << block.id << ", use_count=" << block.use_count << ", unique_id=" << block.unique_id
<< ", timestamp=" << block.timestamp << ", data=" << block.data;
return os;
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>
#include <iterator>
#include <numeric>
#include <queue>
#include <unordered_map>
#include <vector>
namespace turbomind {
// [L, H, S, D]
// [L, S/x, H, x, D]
struct Block {
int id; // fixed linear id in the pool
int ref_count; // all sequences referencing the block
int use_count; // active sequences using the block
uint64_t unique_id; // unique for every block allocation
uint64_t timestamp;
void* data;
friend std::ostream& operator<<(std::ostream& os, const Block& block);
};
inline bool is_active(const Block& block)
{
return block.ref_count > 0 && block.use_count > 0;
}
inline bool is_cached(const Block& block)
{
return block.ref_count > 0 && block.use_count == 0;
}
inline bool is_free(const Block& block)
{
return block.ref_count == 0 && block.use_count == 0 && block.timestamp == 0;
}
struct Snapshot {
int active;
int cached;
int free;
std::vector<int> use_count;
};
class BlockManager {
public:
explicit BlockManager(size_t block_size, double block_count, int chunk_size, IAllocator* allocator);
~BlockManager();
// free -> active (use_count = 1, ref_count = 1)
[[nodiscard]] std::vector<const Block*> Allocate(int count);
// cached -> active (use_count += 1)
[[maybe_unused]] int Lock(const std::vector<const Block*>& bs);
// active -> cached (use_count -= 1)
[[maybe_unused]] int Unlock(const std::vector<const Block*>& bs);
// cached -> free (ref_count = 0)
void Evict(int count);
// cached -> free (ref_count -= 1)
[[maybe_unused]] int Free(const std::vector<const Block*>& bs);
// increase timestamps in reverse order
void Touch(const std::vector<const Block*>& bs);
Snapshot TakeSnapshot();
int max_block_count() const noexcept
{
return max_block_count_;
}
int active_count() const noexcept
{
return active_ids_.size();
}
int cached_count() const noexcept
{
return cached_ids_.size();
}
int free_count() const noexcept
{
return (max_block_count_ - blocks_.size()) + free_ids_.size();
}
friend std::ostream& operator<<(std::ostream& os, const BlockManager&);
private:
static size_t GetBlockCount(size_t block_size, double ratio);
// move indices between sets
static void Move(std::vector<int>& src, const std::vector<int>& delta, std::vector<int>& dst);
// allocate a chunk of blocks
bool Malloc();
private:
size_t block_size_;
int max_block_count_{};
int chunk_size_{};
IAllocator* allocator_;
std::vector<void*> chunks_;
std::vector<int> active_ids_;
std::vector<int> cached_ids_;
std::vector<int> free_ids_;
std::vector<Block> blocks_; // < 100k
// uint64_t unique_id_{1UL << 63};
uint64_t unique_id_{1};
uint64_t timestamp_{1};
};
} // namespace turbomind
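Together, `is_free`/`is_active`/`is_cached` and the methods above describe the block lifecycle: free -> `Allocate` -> active -> `Unlock` -> cached -> `Lock` -> active, with cached blocks leaving the pool through `Free` (last reference dropped) or `Evict` (oldest timestamp first). A minimal usage sketch against this header (the `IAllocator` is assumed to be supplied by the caller; this is not code from the commit):

// Hypothetical walkthrough of the block lifecycle; the IAllocator is assumed
// to come from the caller (e.g. the model's CUDA allocator).
#include "src/turbomind/models/llama/BlockManager.h"

void block_manager_walkthrough(turbomind::IAllocator* allocator)
{
    using namespace turbomind;
    // 128 MB blocks, use 80% of total device memory, default chunking
    BlockManager manager(128 << 20, 0.8, 0, allocator);

    // free -> active: a sequence acquires two blocks (use_count = ref_count = 1)
    std::vector<const Block*> blocks = manager.Allocate(2);
    manager.Touch(blocks);  // record recency for later eviction ordering

    // active -> cached: the sequence is swapped out but keeps its reference
    manager.Unlock(blocks);

    // cached -> active: the sequence is swapped back in
    manager.Lock(blocks);

    // active -> cached -> free: drop the reference for good
    manager.Unlock(blocks);
    manager.Free(blocks);

    // alternatively, cached blocks can be reclaimed oldest-first:
    // manager.Evict(2);
}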
......@@ -10,6 +10,8 @@ add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
LlamaCacheManager.cc
BlockManager.cc
SequenceManager.cc
LlamaContextDecoder.cc
LlamaContextAttentionLayer.cc
LlamaDecoderSelfAttentionLayer.cc
......@@ -28,6 +30,7 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
DynamicDecodeLayer
activation_kernels
decoder_masked_multihead_attention
decoder_multihead_attention
bert_preprocess_kernels
decoding_kernels
unfused_attention_kernels
......@@ -48,4 +51,11 @@ endif()
add_executable(llama_gemm llama_gemm.cc)
target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin)
find_package(Catch2 3 QUIET)
if (Catch2_FOUND)
add_executable(test_cache_manager test_cache_manager.cc)
target_link_libraries(test_cache_manager PRIVATE Llama Catch2::Catch2WithMain)
endif ()
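`test_cache_manager.cc` itself is not shown in this commit view; for reference, a Catch2 v3 test linked against `Catch2::Catch2WithMain` has roughly this shape (the test body is an illustrative stand-in, not the commit's actual test):

// Hypothetical Catch2 v3 test shape; main() is provided by Catch2::Catch2WithMain.
#include <catch2/catch_test_macros.hpp>

static int block_count(size_t total_bytes, size_t block_size)
{
    return static_cast<int>(total_bytes / block_size);
}

TEST_CASE("block count is derived from memory budget", "[cache_manager]")
{
    REQUIRE(block_count(1 << 30, 128 << 20) == 8);
    REQUIRE(block_count(100 << 20, 128 << 20) == 0);
}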
......@@ -2,66 +2,139 @@
#pragma once
#include "src/turbomind/models/llama/LlamaCacheManager.h"
// #include "src/turbomind/models/llama/LlamaCacheManager.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include <condition_variable>
#include <mutex>
namespace turbomind {
struct BatchState {
int* h_context_length;
bool* h_finished;
void* top_k_curand_state;
void* top_p_curand_state;
int* output_ids; // output ids in [B, S]
float* h_rope_theta;
std::vector<int> seq_len_limit;
std::vector<int> is_swap_in;
std::vector<const Sequence*> sequences;
std::vector<std::shared_ptr<Request>> requests;
// |<-- existing -->|<-- swap-in -->|
// |<----------- active ----------->|<-- inactive -->|
int active_size;
int size;
};
template<typename T>
class LlamaV2;
template<typename T>
class LlamaBatch {
public:
int size() const noexcept
{
return batch_size_;
};
void AllocateBuffer(size_t batch_size, size_t session_len);
void AllocatePersistantBuffer(size_t max_batch_size);
void FreeBuffer();
int maxSize() const noexcept
{
return max_batch_size_;
}
using Requests = std::vector<std::shared_ptr<Request>>;
using Signal = std::function<void()>;
int finishedCount() const noexcept
{
return finished_count_;
}
void RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs);
[[nodiscard]] auto ProcessStopRequests(const Requests& requests) -> std::vector<Signal>;
void ProcessInferRequests(const Requests& requests);
void verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
std::vector<std::shared_ptr<Request>>& infer_reqs);
void handleStopRequests(const std::vector<std::shared_ptr<Request>>& requests);
[[nodiscard]] bool Initialize();
void allocateBuffer(size_t batch_size, size_t session_len);
void allocatePersistantBuffer(size_t max_batch_size);
void freeBuffer();
void ContextDecode();
void initializeSampling(int infer_request_count);
struct GenerationState {
int max_init_ctx_len;
int step;
int sum_seq_len;
int max_seq_len;
};
void initialize(const std::vector<std::shared_ptr<Request>>& infer_requests);
void contextDecode();
void InitializeSampling();
GenerationState InitializeGeneration();
void initializeGeneration();
bool generate();
[[nodiscard]] bool Generate(GenerationState& g);
void finish();
void finishRequest(int index, bool force_end);
[[nodiscard]] auto Finish(GenerationState& g) -> std::vector<Signal>;
void synchronize();
void CompleteRequest(int index, bool is_stop_request, bool is_force_end);
void setOutputTensors(int max_gen_step);
void SetOutputTensors(const GenerationState& g);
void
outputContextLogits(T* context_decoder_output, const std::vector<int>& indices, const std::vector<int>& lengths);
OutputContextLogits(T* context_decoder_output, const std::vector<int>& indices, const std::vector<int>& lengths);
explicit LlamaBatch(int max_batch_size, int max_context_token_num, int session_len, LlamaV2<T>* llama);
explicit LlamaBatch(int max_batch_size,
int max_context_token_num,
int session_len,
std::unique_ptr<SequenceManager> sequence_manager,
LlamaV2<T>* llama);
~LlamaBatch()
{
freeBuffer();
TM_LOG_ERROR("~LlamaBatch()");
model_->shared_state_->request_queue.close();
internal_thread_.join();
if (output_thread_.joinable()) {
{
std::lock_guard lock{output_mutex_};
output_stop_token_ = true;
}
output_cv_.notify_one();
output_thread_.join();
}
FreeBuffer();
}
void Start();
private:
void InternalThreadEntry(int device_id);
void OutputThreadEntry();
void UpdateSequenceStates(BatchState& state, int index);
void CopyState(const std::pair<BatchState*, int> _src, const std::pair<BatchState*, int>& _dst);
void SaveRandomState(BatchState& state, int idx);
void LoadRandomState(BatchState& state, int idx);
void BarrierSignalRequests(Barrier& barrier, const std::vector<Signal>& signals);
// analogous to `std::copy_n`
template<typename U>
U* Copy(const U* src, size_t count, U* dst)
{
check_cuda_error(cudaMemcpyAsync(dst, src, sizeof(U) * count, cudaMemcpyDefault, stream_));
return dst += count;
}
template<typename U>
U* Clear(U* data, size_t count)
{
check_cuda_error(cudaMemsetAsync(data, 0, sizeof(U) * count, stream_));
return data += count;
}
private:
......@@ -70,52 +143,67 @@ private:
const int session_len_;
const int rank_;
const bool debug_;
const int step_length_;
LlamaV2<T>* const llama_;
// active requests
std::vector<std::shared_ptr<Request>> requests_;
T* context_decoder_input_buf_{}; // CTXDEC
T* context_decoder_output_buf_{}; // CTXDEC
int* context_decoder_ids_buf_{};
T* decoder_input_buf_{}; // CTXDEC, GENERATE
T* decoder_output_buf_{}; // CTXDEC, GENERATE
LlamaV2<T>* const model_;
int* input_ids_buf_{}; // input token ids + cache missed token ids, CTXDEC
int* input_length_buf_{}; // input + cache missed length, CTXDEC, GENERATE
int* history_length_buf_{}; // history length, CTXDEC
int* context_length_buf_{}; // history length + input_length, CTXDEC, GENERATE
std::unique_ptr<SequenceManager> sequence_manager_;
int* total_padding_count_{}; // GENERATE
int* sequence_lengths_{}; // current sequence length
///////////////////////////////////////////////////////////////////
// k/v cache block buffers
int* cu_block_counts_{};
uintptr_t* k_block_ptrs_{};
uintptr_t* v_block_ptrs_{};
uint64_t* k_cache_ptr_buf_{};
uint64_t* v_cache_ptr_buf_{};
////////////////////////////////////////////////////////////////////
// context decoding temp buffers
T* context_decoder_input_buf_{};
T* context_decoder_output_buf_{};
int* context_decoder_ids_buf_{};
int* input_ids_buf_{};
// lengths
int* input_length_buf_{}; // input + cache missed length
int* context_length_buf_{}; // history length + input_length
// temp buffers used for block->linear kv-cache conversion
T* tmp_k_cache_buf_{};
T* tmp_v_cache_buf_{};
void** tmp_k_ptrs_{};
void** tmp_v_ptrs_{};
void** h_tmp_k_ptrs_{};
void** h_tmp_v_ptrs_{};
T* decoder_input_buf_{};
T* decoder_output_buf_{};
int* sequence_lengths_{}; // current sequence length
int* init_ctx_lens_{};
float* logits_buf_{}; // combined logits
float* local_logits_buf_{}; // tensor parallel local logits
float* context_logits_buf_{};
float* local_context_logits_buf_{};
float* rope_theta_{};
// used by dynamic decoder
int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step`
int* output_ids_buf_{}; // output ids in [B, S]
int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step`
int* end_ids_buf_{};
bool* finished_buf_{};
uint32_t* seq_limit_len_{};
int** request_output_ids_ptrs_{};
int* request_output_ids_lens_{};
int** request_seqlen_ptrs_{};
int** h_request_output_ids_ptrs_{};
int* h_request_output_ids_lens_{};
int** h_request_seqlen_ptrs_{};
// pinned buffers
int* h_input_ids_buf_{};
int* h_input_length_buf_{};
int* h_history_length_buf_{};
int* h_context_length_buf_{};
int* h_sequence_lengths_{};
bool* h_finished_buf_{};
uintptr_t* h_k_cache_ptr_buf_{};
uintptr_t* h_v_cache_ptr_buf_{};
uint32_t* h_seq_limit_len_{};
int* h_cu_block_counts_{};
uintptr_t* h_k_block_ptrs_{};
uintptr_t* h_v_block_ptrs_{};
int* stop_words_buf_{}; // [batch_size, 2, kMaxStopWordsLen]
int* bad_words_buf_{};
......@@ -125,24 +213,19 @@ private:
float* h_repetition_penalty_{};
uint64_t* h_random_seed_{};
void* topk_curandstate_buf_{};
void* topp_curandstate_buf_{};
std::array<BatchState, 3> states_{};
// hard limits for persistent buffers
static constexpr int kMaxStopBadWordsLen = 32;
BatchState* state_{};
BatchState* back_{};
BatchState* incoming_{};
using CachedSeq = LlamaCacheManager::Sequence;
uint64_t request_count_{0};
std::vector<CachedSeq> cached_seq_;
std::vector<int> request_seq_len_limit_;
// hard limits for persistent buffers
static constexpr int kMaxStopBadWordsLen = 32;
const DataType data_type_{};
int batch_size_{};
int max_context_len_{};
int step_{};
int finished_count_{};
bool is_allocate_persistant_buffer_ = false;
bool is_allocate_buffer_ = false;
......@@ -154,6 +237,15 @@ private:
cudaStream_t stream_{};
cublasMMWrapper* cublas_wrapper_{};
IAllocator* allocator_{};
std::thread internal_thread_;
// async stream callback utils
std::thread output_thread_;
std::mutex output_mutex_;
std::condition_variable output_cv_;
Requests output_reqs_;
bool output_stop_token_{false};
};
} // namespace turbomind
......@@ -21,6 +21,7 @@
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/decoder_multihead_attention/kv_cache.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
......@@ -28,6 +29,7 @@
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/logger.h"
namespace turbomind {
......@@ -116,6 +118,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param cu_seqlens [batch_size+1], int
* \param cu_block_counts [batch_size+1], int
* \param max_seq_len [1], int on cpu
* \param is_final_layer [1], bool on cpu
* \param layer_id [1], int on cpu
......@@ -141,13 +144,23 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
T* attention_input = input_tensors->at("input_query").getPtr<T>();
T* attention_mask = input_tensors->at("attention_mask").getPtr<T>();
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto history_length = input_tensors->at("history_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
int* cu_block_counts = input_tensors->at("cu_block_counts").getPtr<int>();
const float* rope_theta = input_tensors->getPtr<const float>("rope_theta", nullptr);
const auto padding_offset = input_tensors->at("padding_offset").getPtr<int>();
auto Show = [&](const T* x, size_t n) {
std::vector<T> vec(n);
cudaMemcpyAsync(vec.data(), x, sizeof(T) * n, cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
std::vector<float> float_vec(vec.begin(), vec.end());
dbg(float_vec);
};
/////////////////////////////////////////////
/// allocate buffers
allocateBuffer(batch_size, num_token, max_q_len, max_k_len);
......@@ -166,26 +179,32 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
qkv_buf_,
weights->qkv.bias,
padding_offset, // padding_offset,
history_length, // used for applying rotary embedding
context_length, // used for applying rotary embedding
input_length,
rope_theta,
batch_size,
max_q_len, // seq_len
num_token, // batch_size * seq_len
local_head_num_,
local_kv_head_num_,
size_per_head_,
params_.rotray_embedding_dim,
params_.rotary_embedding_dim,
params_.rotary_embedding_base,
params_.max_position_embeddings,
params_.use_dynamic_ntk,
false, // params_.use_dynamic_ntk,
params_.use_logn_attn,
stream_);
sync_check_cuda_error();
const size_t layer_offset = layer_id * local_kv_head_num_ * max_seq_len * size_per_head_;
// [2, L, H, s, D]
const size_t layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
auto k_cache_ptrs = output_tensors->getPtr<void*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<void*>("value_cache");
auto tmp_k_ptrs = output_tensors->getPtr<T*>("tmp_k");
auto tmp_v_ptrs = output_tensors->getPtr<T*>("tmp_v");
auto k_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
//////////////////////////////////////////////////////////
/// insert the k/v computed from inputs into k/v cache
/// transpose kv -> kv cache
......@@ -194,25 +213,53 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
// v_buf_2 [B, kvH, s, D] -> val_cache [B, kvH, S[t:t+s], D/x, x]
invokeExtendKVCache(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
k_buf_2_,
v_buf_2_,
batch_size,
cu_block_counts,
input_length,
context_length,
batch_size,
kv_cache_block_len_,
layer_offset,
max_q_len,
history_length,
max_seq_len,
size_per_head_,
local_kv_head_num_,
stream_,
quant_policy_,
weights->past_kv_scale.data());
weights->past_kv_scale.data(),
stream_);
sync_check_cuda_error();
const int kv_cache_elem_bits = quant_policy_ & QuantPolicy::kCacheKVInt8 ? 8 : sizeof(T) * 8;
ConvertKvCacheBlocksToLinear2((const void**)k_cache_ptrs,
(const void**)v_cache_ptrs,
(T**)tmp_k_ptrs,
(T**)tmp_v_ptrs,
cu_block_counts,
context_length,
layer_offset,
kv_cache_block_len_,
max_seq_len,
local_kv_head_num_,
size_per_head_,
batch_size,
quant_policy_,
weights->past_kv_scale.data(),
stream_);
sync_check_cuda_error();
// dbg(kv_cache_block_len_, max_seq_len, local_kv_head_num_, size_per_head_, batch_size);
// void *kk, *vv;
// cudaMemcpyAsync(&kk, tmp_k_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaMemcpyAsync(&vv, tmp_v_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaStreamSynchronize(stream_);
// Show((const T*)kk, local_kv_head_num_ * max_seq_len * size_per_head_);
// Show((const T*)vv, local_kv_head_num_ * max_seq_len * size_per_head_);
if (use_fmha_) {
fusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
fusedMultiHeadAttention(tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
cu_seqlens,
input_tensors->at("context_lengths").getPtr<int>(),
......@@ -222,9 +269,9 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
max_seq_len);
}
else {
unfusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
unfusedMultiHeadAttention(tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
padding_offset,
context_length,
......@@ -237,6 +284,14 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
weights->past_kv_scale.data());
}
// Compare(qkv_buf_3_, num_token * hidden_units_, Concat("qkv_buf_3", layer_id), kCmpRead, stream_);
// dbg(max_seq_len);
if (0) {
Show(qkv_buf_3_, num_token * hidden_units_);
}
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
......@@ -342,7 +397,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
local_head_num_,
head_n_rep_,
stream_,
quant,
0, // dequant handled in block->linear conversion
kv_scale);
sync_check_cuda_error();
......
......@@ -45,6 +45,7 @@ public:
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
size_per_head_(size_per_head),
......@@ -58,6 +59,7 @@ public:
cublas_wrapper_(cublas_wrapper),
linear_(cublas_wrapper, stream),
allocator_(allocator),
kv_cache_block_len_(cache_block_seq_len),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
use_fmha_(use_fmha),
quant_policy_(quant_policy)
......@@ -99,6 +101,7 @@ private:
const size_t local_kv_head_num_;
const size_t local_head_num_;
const size_t head_n_rep_;
const size_t kv_cache_block_len_;
const bool is_free_buffer_after_forward_;
const LlamaAttentionParams params_;
......
......@@ -25,7 +25,9 @@
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/debug_utils.h"
namespace turbomind {
......@@ -64,6 +66,7 @@ template<typename T>
void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
......@@ -78,6 +81,7 @@ void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
allocator_,
is_free_buffer_after_forward_,
use_fmha,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
......@@ -93,6 +97,7 @@ void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
template<typename T>
void LlamaContextDecoder<T>::forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final)
......@@ -107,18 +112,17 @@ void LlamaContextDecoder<T>::forwardSelfAttn(const Session&
{"padding_offset", {MEMORY_GPU, TYPE_INT32, {sess.token_num}, padding_offset_}},
{"cu_seqlens", {MEMORY_GPU, TYPE_INT32, {sess.batch_size + 1}, cu_seqlens_}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.input_length}},
{"history_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.history_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.context_length}},
{"cu_block_counts", input_tensors->at("cu_block_counts")},
{"rope_theta", input_tensors->at("rope_theta")},
{"max_seq_len", input_tensors->at("max_seq_len")}};
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"hidden_features", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_io}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
{"key_cache", output_tensors->at("key_cache")},
{"value_cache", output_tensors->at("value_cache")},
{"tmp_k", output_tensors->at("tmp_k")},
{"tmp_v", output_tensors->at("tmp_v")}};
context_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
......@@ -139,6 +143,7 @@ LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
......@@ -150,7 +155,7 @@ LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
initialize(attn_params, kv_head_num, use_fmha, quant_policy);
initialize(attn_params, kv_head_num, use_fmha, cache_block_seq_len, quant_policy);
}
template<typename T>
......@@ -201,17 +206,16 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
sess.weights = decoder_layer_weights;
sess.input_length = input_tensors->at("input_lengths").getPtr<int>();
sess.history_length = input_tensors->at("history_lengths").getPtr<int>();
sess.context_length = input_tensors->at("context_lengths").getPtr<int>();
T* decoder_input_output = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
allocateBuffer(sess.batch_size, sess.token_num, sess.max_query_len, sess.max_key_len);
// dbg(padding_offset_);
FT_CHECK(padding_offset_);
size_t tmp_token_num{};
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
......@@ -222,6 +226,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
sess.max_query_len,
stream_);
sync_check_cuda_error();
dbg(tmp_token_num, sess.token_num);
FT_CHECK(tmp_token_num == sess.token_num);
invokeCreateCausalMasks(attention_mask_,
......@@ -233,6 +238,9 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
stream_);
sync_check_cuda_error();
// Compare(
// decoder_input_output, sess.token_num * hidden_units_, Concat("context_decoder_input", 0), kCmpRead, stream_);
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
......@@ -247,7 +255,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(sess, decoder_output, input_tensors, layer, false);
forwardSelfAttn(sess, decoder_output, output_tensors, input_tensors, layer, false);
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
......
......@@ -40,7 +40,11 @@ protected:
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
void freeBuffer() override;
void initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, bool use_fmha, int quant_policy);
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
size_t head_num_;
size_t size_per_head_;
......@@ -63,21 +67,19 @@ protected:
const DataType data_type_;
struct Session {
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
Tensor* k_cache;
Tensor* v_cache;
int* input_length{};
int* history_length{};
int* context_length{};
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
int* input_length{};
int* context_length{};
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final);
......@@ -96,6 +98,7 @@ public:
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
~LlamaContextDecoder() override;
......
......@@ -41,6 +41,7 @@ LlamaDecoder<T>::LlamaDecoder(size_t head_num,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
......@@ -53,7 +54,7 @@ LlamaDecoder<T>::LlamaDecoder(size_t head_num,
data_type_(getTensorType<T>())
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(attn_params, kv_head_num, quant_policy);
initialize(attn_params, kv_head_num, cache_block_seq_len, quant_policy);
}
template<typename T>
......@@ -65,7 +66,10 @@ LlamaDecoder<T>::~LlamaDecoder()
}
template<typename T>
void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int quant_policy)
void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
int cache_block_seq_len,
int quant_policy)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
......@@ -78,6 +82,7 @@ void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params, size_t
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
......@@ -118,6 +123,7 @@ void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session&
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
NvtxScope scope("self_attn");
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
......@@ -180,60 +186,73 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
// for the shape of key cache, refer to decoder_masked_multihead_attention_template.hpp
NvtxScope forward_scope("decoder_forward");
Session sess{};
sess.batch_size = input_tensors->at("decoder_input").shape[0];
sess.weights = decoder_layer_weights;
allocateBuffer(sess.batch_size);
sess.ite = input_tensors->at("ite").getVal<const int>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
sess.max_memory_len = input_tensors->at("max_seq_len").getVal<int>();
T* decoder_input = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
int step = input_tensors->at("step").getVal<int>();
// Compare(decoder_input, sess.batch_size * hidden_units_, Concat("decoder_input", 0, step), kCmpRead, stream_);
////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
{
NvtxScope rms_norm_scope("rms_norm_0");
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
for (size_t layer = 0; layer < num_layer_; ++layer) {
NvtxScope layer_scope("decode_layer");
// output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
forwardSelfAttn(sess, decoder_output, input_tensors, layer);
invokeFusedAddBiasResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
{
NvtxScope rms_norm_scope("rms_norm_1");
invokeFusedAddBiasResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
// decoder_layer_output_ = ffn(decoder_normed_input_)
forwardFfn(sess, decoder_output, layer);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
{
NvtxScope rms_norm_scope("rms_norm_2");
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
}
if (is_free_buffer_after_forward_) {
......
......@@ -35,7 +35,8 @@ protected:
void allocateBuffer() override; // deprecated
void allocateBuffer(size_t batch_size);
void freeBuffer() override;
void initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int quant_policy);
void
initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int cache_block_seq_len, int quant_policy);
size_t head_num_;
size_t size_per_head_;
......@@ -53,8 +54,6 @@ protected:
struct Session {
size_t batch_size;
int ite;
size_t max_memory_len;
Tensor* k_cache;
Tensor* v_cache;
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
......@@ -80,6 +79,7 @@ public:
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy);
~LlamaDecoder() override;
......
......@@ -302,7 +302,7 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
self_attn_weights.past_kv_scale = loadArrayFromBin({4}, scale_path);
}
else {
self_attn_weights.past_kv_scale = {};
self_attn_weights.past_kv_scale = {1.f, 0.f, 1.f, 0.f};
}
}
......
......@@ -24,6 +24,7 @@
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
......@@ -32,7 +33,7 @@ template<typename T>
class LlamaDecoderSelfAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size, int key_len, int max_memory_len);
void allocateBuffer(size_t batch_size);
LlamaDecoderSelfAttentionLayer(size_t head_num,
size_t kv_head_num,
......@@ -43,6 +44,7 @@ public:
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
kv_head_num_(kv_head_num),
......@@ -56,9 +58,11 @@ public:
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
kv_cache_block_len_(cache_block_seq_len),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
quant_policy_(quant_policy)
{
arch_ = getSMVersion();
}
~LlamaDecoderSelfAttentionLayer()
......@@ -76,6 +80,7 @@ private:
const size_t local_head_num_;
const size_t local_kv_head_num_;
const size_t local_hidden_units_;
const size_t kv_cache_block_len_;
const bool is_free_buffer_after_forward_;
const int quant_policy_;
......@@ -90,7 +95,11 @@ private:
T* qkv_buf_ = nullptr;
T* context_buf_ = nullptr;
static constexpr int kMaxSplitK = 16; // must be <= WARP_SIZE
float* workspace_ = nullptr;
bool is_allocate_buffer_{};
int arch_{};
};
} // namespace turbomind