Unverified Commit ab1767cf authored by Li Zhang, committed by GitHub

TurboMind 2 (#590)

* refresh decoder attention kernel

* block-level kv cache

* `BlockManager` & `SequenceManager`

* update

* update

* update

* update

* rename

* GQA support

* fix context length

* GQA dispatch

* kv8

* tune

* async stream cb

* nvtx

* config parsing

* debug

* optimize output cost

* split-k decoding

* minor

* truncate `session_len` by available blocks

* minor

* license

* fix

* dispatch `cp.async`

* fix linking

* fix

* fix deadlock

* guard input length

* correct start offset

* fix prefill chunking

* fix `cache_block_seq_len` param passing

* fix `block_size` fmtstr

* fix output tokens

* fix batch resizing

* fix masking of finished sequences

* add debug util

* free unused block early

* add ntk scaling and logn scaling

* cmake flags

* fix typo

* w4a16 for sm75

* fix msvc build

* fix msvc build

* fix block verification

* fix msvc build

* use `std::shuffle`

* fix lint

* fix lint

* fix lint

* clear incoming buffer

* clear finished requests

* fix batch initialization

* fix typo

* fix typo

* fix comparison
parent 06125966
......@@ -28,6 +28,8 @@
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/instance_comm.h"
......@@ -46,9 +48,7 @@ public:
std::vector<std::shared_ptr<Request>> stop_requests;
RequestQueue request_queue;
std::shared_ptr<Barrier> barrier;
// rank 0 sets flag to true if there are no more tasks in the request_queue
bool should_stop = false;
bool abort;
};
~LlamaV2();
......@@ -67,7 +67,8 @@ public:
int step_length,
int start_id,
int end_id,
int cache_max_entry_count,
float cache_max_block_count,
int cache_block_seq_len,
int cache_chunk_size,
int quant_policy,
bool use_context_fmha,
......@@ -104,39 +105,45 @@ public:
private:
friend class Batch;
void internalThreadEntry(int device_id);
void
initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, bool use_context_fmha, int quant_policy);
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_context_fmha,
int cache_block_seq_len,
int quant_policy);
void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step);
void contextDecode(T* deocder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* history_length,
const int* context_length,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const int* total_padding_count,
bool* finished,
int step,
int ite,
size_t session_len,
size_t batch_size);
void contextDecode(T* deocder_output,
uintptr_t* k_block_ptrs,
uintptr_t* v_block_ptrs,
void** k_tmp_ptrs,
void** v_tmp_ptrs,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* context_length,
const int* cu_block_counts,
const float* rope_theta,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const bool* finished,
const int* cu_block_counts,
const float* rope_theta,
int step,
int ite,
int sum_seq_len,
int max_seq_len,
size_t batch_size);
void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size);
......@@ -156,7 +163,15 @@ private:
size_t token_ids_len,
size_t batch_size);
void start();
curandState_t* GetTopKState(int index)
{
return dynamic_decode_layer_->topk_curandstate_buf() + index;
}
curandState_t* GetTopPState(int index)
{
return dynamic_decode_layer_->topp_curandstate_buf() + index;
}
private:
friend class LlamaBatch<T>;
......@@ -169,6 +184,8 @@ private:
size_t vocab_size_padded_;
float rmsnorm_eps_ = 1e-6f;
const LlamaAttentionParams attn_params_;
static constexpr bool neox_rotary_style_ = false;
const int start_id_;
......@@ -176,6 +193,7 @@ private:
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_kv_head_num_;
NcclParam tensor_para_;
cudaStream_t stream_;
......@@ -186,20 +204,15 @@ private:
const bool debug_{false};
std::unique_ptr<LlamaCacheManager> kv_cache_mgr_;
LlamaWeight<T>* weights_{};
LlamaDecoder<T>* decoder_{};
LlamaContextDecoder<T>* context_decoder_{};
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
const int step_length_;
LlamaBatch<T> batch_;
std::shared_ptr<SharedState> shared_state_;
std::thread internal_thread_;
ffi_api_lock_ctrl_t ffi_lock_ = nullptr;
const int step_length_;
std::shared_ptr<SharedState> shared_state_;
ffi_api_lock_ctrl_t ffi_lock_;
std::unique_ptr<LlamaBatch<T>> batch_;
};
} // namespace turbomind
......@@ -14,9 +14,11 @@ namespace turbomind {
struct Request {
uint64_t id;
bool start_flag;
bool end_flag;
bool stop_flag;
uint64_t priority;
bool start_flag;
bool end_flag;
bool stop_flag;
// per rank inputs/outputs
std::vector<TensorMap> inputs;
......@@ -31,7 +33,8 @@ struct Request {
kConflict = 2,
kBusy = 3,
kInactive = 4,
kFail = 5
kFail = 5,
kTooLong = 6
};
std::promise<int> signal;
};
......@@ -66,11 +69,16 @@ public:
void dequeue(std::vector<std::shared_ptr<Request>>& stop_requests,
std::vector<std::shared_ptr<Request>>& infer_requests,
unsigned max_infer_count,
bool blocking)
bool blocking,
bool& abort)
{
std::unique_lock<std::mutex> lock(mutex_);
if (blocking) {
cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty() && closed_ == false); });
cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()) || closed_; });
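// the queue was closed while waiting; report abort to the caller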
if (closed_) {
abort = true;
return;
}
}
stop_requests.clear();
......@@ -88,8 +96,10 @@ public:
void close()
{
std::lock_guard<std::mutex> lock(mutex_);
closed_ = true;
{
std::lock_guard<std::mutex> lock(mutex_);
closed_ = true;
}
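// notify after releasing the lock so woken threads can acquire it immediately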
cv_.notify_all();
}
......@@ -98,7 +108,7 @@ private:
std::queue<std::shared_ptr<Request>> infer_queue_;
std::mutex mutex_;
std::condition_variable cv_;
bool closed_ = false;
bool closed_{false};
};
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/BlockManager.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/logger.h"
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <numeric>
#include <stdexcept>
namespace turbomind {
SequenceManager::SequenceManager(size_t layer_num,
size_t head_num,
size_t head_dim,
size_t block_seq_len,
double block_count,
int chunk_size,
size_t elem_bits,
int rank,
IAllocator* allocator):
block_seq_len_(block_seq_len)
{
constexpr int kBitsPerByte = 8;
// [2, L, H, block_seq_len, D]
size_t block_size = 2UL * layer_num * head_num * block_seq_len * head_dim * elem_bits / kBitsPerByte;
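// e.g. with hypothetical shapes layer_num=32, head_num=32, block_seq_len=128, head_dim=128
// and 16-bit elements: 2 * 32 * 32 * 128 * 128 * 16 / 8 = 64 MiB per block (K and V halves combined)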
block_manager_ = std::make_unique<BlockManager>(block_size, block_count, chunk_size, allocator);
val_offset_ = block_size / 2;
}
const Sequence* SequenceManager::Create(uint64_t id)
{
Sequence sequence{id, {}, {}, {}, {}, {}, {}, 0.f};
auto it = sequences_.find(id);
if (it != sequences_.end()) {
if (rank_ == 0) {
TM_LOG_WARNING("[SequenceManager][Create] Removing conflicting ID %ld", (long)id);
}
auto& seq = it->second;
if (seq.status != Sequence::kCached) {
unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end());
}
seq = std::move(sequence);
}
else {
it = sequences_.emplace_hint(it, id, std::move(sequence));
}
return &it->second;
}
const Sequence* SequenceManager::Get(uint64_t id)
{
if (auto it = sequences_.find(id); it != sequences_.end()) {
auto& sequence = it->second;
return &it->second;
}
return nullptr;
}
bool SequenceManager::Contains(uint64_t id)
{
return sequences_.find(id) != sequences_.end();
}
bool SequenceManager::Erase(uint64_t id)
{
if (auto it = sequences_.find(id); it != sequences_.end()) {
auto& seq = it->second;
if (seq.status != Sequence::kCached) {
unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end());
freed_.insert(freed_.end(), seq.blocks.begin(), seq.blocks.end());
}
else {
for (int i = 0; i < seq.blocks.size(); ++i) {
// filter invalidated blocks
if (seq.blocks[i]->unique_id == seq.block_unique_ids[i]) {
freed_.push_back(seq.blocks[i]);
}
}
}
sequences_.erase(it);
}
else {
throw std::out_of_range(std::to_string(id));
}
return false;
}
void SequenceManager::VerifyAndLockCached(const Sequences& sequences)
{
std::vector<const Block*> blocks;
for (const auto& p : sequences) {
auto& seq = const_cast<Sequence&>(*p);
if (seq.status != Sequence::kCached) {
continue;
}
FT_CHECK(seq.blocks.size() == seq.block_unique_ids.size());
if (need_verify_) {
for (int i = 0; i < seq.blocks.size(); ++i) {
if (seq.blocks[i]->unique_id != seq.block_unique_ids[i]) {
seq.blocks.resize(i);
seq.block_unique_ids.resize(i);
break;
}
}
}
blocks.insert(blocks.end(), seq.blocks.begin(), seq.blocks.end());
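// clamp the cached length to what the (possibly truncated) block list can hold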
seq.cache_len = std::min<int>(seq.cache_len, seq.blocks.size() * block_seq_len_);
seq.status = Sequence::kLocked;
}
block_manager_->Lock(blocks);
need_verify_ = false;
}
void SequenceManager::CommitUnlockAndFree()
{
if (!unlocked_.empty()) {
block_manager_->Unlock(unlocked_);
unlocked_.clear();
}
if (!freed_.empty()) {
block_manager_->Free(freed_);
freed_.clear();
}
}
void SequenceManager::UpdateAndSetUnlock(const Sequence& sequence)
{
FT_CHECK(sequence.status != Sequence::kCached);
auto& seq = const_cast<Sequence&>(sequence);
block_manager_->Touch(seq.blocks);
unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end());
seq.status = Sequence::kCached;
}
namespace {
struct Schedule {
int free;
int cached;
int allocate{};
int evict{};
int preempt{};
int last;
Sequences active;
std::vector<int> block_counts;
Sequences inactive;
Sequences victims;
Schedule(Snapshot snapshot, int size):
free(snapshot.free),
cached(snapshot.cached),
last(size),
use_count_(std::move(snapshot.use_count)),
unlocked_(size),
it_(size)
{
}
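// Walk victim candidates backwards from `it_` down to `vidx`, decrementing block use
// counts and recording, per sequence, how many of its blocks become unreferenced;
// returns that count for sequence `vidx`.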
int Unlock(const Sequences& seqs, int vidx)
{
while (vidx < it_) {
const auto& blocks = seqs[--it_]->blocks;
int count = 0;
for (const auto& p : blocks) {
count += static_cast<int>(--use_count_[p->id] == 0);
}
unlocked_[it_] = count;
}
return unlocked_[vidx];
}
private:
std::vector<int> use_count_;
std::vector<int> unlocked_;
int it_;
};
template<typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
os << "[";
for (int i = 0; i < v.size(); ++i) {
os << (i ? "," : "") << v[i];
}
os << "]";
return os;
}
std::ostream& operator<<(std::ostream& os, const Schedule& s)
{
os << "free=" << s.free << ", cached=" << s.cached << ", allocate=" << s.allocate << ", evict=" << s.evict
<< ", preempt=" << s.preempt << ", active=" << s.active << ", victims=" << s.victims
<< ", block_counts=" << s.block_counts << ", inactive=" << s.inactive;
return os;
}
struct Transaction {
int index_;
int block_count_;
int allocate_{};
int evict_{};
int preempt_{};
Sequences victims_;
const Sequences& sequences_;
Schedule& schedule_;
explicit Transaction(const Sequences& sequences, int index, int block_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count)
{
}
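// Try to satisfy `block_count_` for sequence `index_`: first take free blocks, then
// evictable cached blocks, then preempt sequences further down the priority-sorted
// list; commit the transaction if the demand is met, otherwise mark the sequence inactive.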
void Process()
{
int count = block_count_;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
}
}
if (count == 0) {
Commit();
}
else {
schedule_.inactive.push_back(sequences_[index_]);
}
}
void Commit()
{
// update available resources
schedule_.free -= allocate_;
FT_CHECK(schedule_.free >= 0);
schedule_.cached += preempt_;
schedule_.cached -= evict_;
FT_CHECK(schedule_.cached >= 0);
// update scheduled operations
schedule_.allocate += allocate_;
schedule_.evict += evict_;
schedule_.preempt += preempt_;
schedule_.victims.insert(schedule_.victims.end(), victims_.begin(), victims_.end());
// update active sequences
schedule_.active.push_back(sequences_[index_]);
schedule_.block_counts.push_back(block_count_);
}
};
std::ostream& operator<<(std::ostream& os, const Transaction& trans)
{
os << "index=" << trans.index_ << ", block_count=" << trans.block_count_ << ", allocate=" << trans.allocate_
<< ", evict=" << trans.evict_ << ", preempt=" << trans.preempt_ << ", victims=" << trans.victims_;
return os;
}
} // namespace
void SequenceManager::SortByPriority(Sequences& sequences,
std::vector<int>& context_lengths,
const std::vector<uint64_t>& priorities)
{
// sort according to priority
std::vector<int> idxs(sequences.size());
std::iota(idxs.begin(), idxs.end(), 0);
std::sort(idxs.begin(), idxs.end(), [&](int i, int j) {
return priorities[i] < priorities[j]; //
});
Sequences tmp_sequences(sequences.size());
std::vector<int> tmp_lengths(context_lengths.size());
for (int i = 0; i < sequences.size(); ++i) {
tmp_sequences[i] = sequences[idxs[i]];
tmp_lengths[i] = context_lengths[idxs[i]];
}
sequences.swap(tmp_sequences);
context_lengths.swap(tmp_lengths);
}
std::vector<int> SequenceManager::CountRequiredBlocks(const Sequences& sequences,
const std::vector<int>& context_lengths,
int step_length)
{
std::vector<int> required(sequences.size());
for (int i = 0; i < sequences.size(); ++i) {
int seq_len = context_lengths[i] + step_length;
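// blocks needed to cover seq_len, rounded up, minus blocks the sequence already owns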
int count = (seq_len + block_seq_len_ - 1) / block_seq_len_ - static_cast<int>(sequences[i]->blocks.size());
required[i] = std::max(0, count);
}
return required;
}
void SequenceManager::AssignAndActivate(const Sequences& sequences, //
const std::vector<int>& counts,
const std::vector<const Block*>& blocks)
{
FT_CHECK(sequences.size() == counts.size());
auto first = blocks.begin();
for (int i = 0; i < sequences.size(); ++i) {
auto& s = const_cast<Sequence&>(*sequences[i]);
auto count = counts[i];
// dbg(count);
auto last = first + count;
std::for_each(first, last, [&](const Block* b) {
s.blocks.push_back(b);
s.block_unique_ids.push_back(b->unique_id);
});
s.status = Sequence::kActive;
first = last;
}
}
auto SequenceManager::Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length) -> Outcome
{
////////////////////////////////////////////////////////////////////////////////
/// Schedule the assignment of blocks to sequences
// process deferred unlock and free operations
CommitUnlockAndFree();
SortByPriority(sequences, context_lengths, priorities);
// Verify and lock cached sequences to avoid their blocks being evicted unnoticed
// the blocks can still be preempted later
VerifyAndLockCached(sequences);
std::vector<int> required = CountRequiredBlocks(sequences, context_lengths, step_length);
// dbg(required);
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size());
// `schedule.last` is decreasing in the loop
for (int i = 0; i < schedule.last; ++i) {
Transaction{sequences, i, required[i], schedule}.Process();
}
// mark remaining sequences invalid
for (int i = schedule.last; i < sequences.size(); ++i) {
schedule.inactive.push_back(sequences[i]);
}
////////////////////////////////////////////////////////////////////////////////
/// Schedule is ready, time to execute it. (locked -> cached -> free -> locked)
// combine allocate and evict since evicted blocks are reused by allocation
schedule.allocate += schedule.evict;
if (schedule.allocate) {
dbg(*block_manager_);
}
Outcome outcome{};
outcome.allocation = schedule.allocate;
outcome.swap_in = std::count_if(schedule.active.begin(), schedule.active.end(), [](auto p) {
if (p->status != Sequence::kActive) {
dbg(*p);
}
return p->status != Sequence::kActive; //
});
outcome.swap_out = std::count_if(schedule.inactive.begin(), schedule.inactive.end(), [](auto p) {
if (p->status == Sequence::kActive) {
dbg(*p);
}
return p->status == Sequence::kActive; //
});
// release preempted blocks -> cached
if (!schedule.victims.empty()) {
for (const auto& p : schedule.victims) {
UpdateAndSetUnlock(*p);
}
CommitUnlockAndFree();
}
// evict cached blocks -> free
if (schedule.evict) {
block_manager_->Evict(schedule.evict);
need_verify_ = true;
}
// allocate & assign blocks
{
std::vector<const Block*> blocks;
if (schedule.allocate) {
blocks = block_manager_->Allocate(schedule.allocate);
}
AssignAndActivate(schedule.active, schedule.block_counts, blocks);
}
// active -> locked
for (const auto& p : schedule.inactive) {
if (p->status == Sequence::kActive) {
const_cast<Sequence*>(p)->status = Sequence::kLocked;
}
}
return outcome;
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/turbomind/models/llama/BlockManager.h"
namespace turbomind {
struct Sequence {
enum Status
{
kCached = 0,
kLocked,
kActive
};
uint64_t id;
Status status;
std::vector<const Block*> blocks;
std::vector<uint64_t> block_unique_ids;
mutable std::vector<int> tokens; // update by user
mutable int cache_len;
// additional data kept round-to-round
mutable std::vector<std::byte> random_state; // update by user
mutable float rope_theta;
friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};
using Sequences = std::vector<const Sequence*>;
inline std::ostream& operator<<(std::ostream& os, const Sequence& seq)
{
os << "id=" << seq.id << ", status=" << seq.status << ", token_count=" << seq.tokens.size()
<< ", block_count=" << seq.blocks.size() << ", cache_len=" << seq.cache_len
<< ", random_state_size=" << seq.random_state.size();
return os;
}
class SequenceManager {
public:
explicit SequenceManager(size_t layer_num,
size_t head_num,
size_t head_dim,
size_t block_seq_len,
double block_count,
int chunk_size,
size_t elem_bits,
int rank,
IAllocator* allocator);
SequenceManager(const SequenceManager&) = delete;
SequenceManager(SequenceManager&&) noexcept = default;
const Sequence* Create(uint64_t id);
const Sequence* Get(uint64_t id);
bool Contains(uint64_t id);
bool Erase(uint64_t id);
void UpdateAndSetUnlock(const Sequence& seq);
struct Outcome {
int allocation;
int swap_in;
int swap_out;
};
Outcome Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length);
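// Each block stores keys in its first half and values starting at `val_offset_`
// (half the block size, see the constructor).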
void* OffsetKey(void* block_ptr)
{
return block_ptr;
}
void* OffsetVal(void* block_ptr)
{
return (std::byte*)block_ptr + val_offset_;
}
int max_block_count() const noexcept
{
return block_manager_->max_block_count();
}
private:
void CommitUnlockAndFree();
void VerifyAndLockCached(const Sequences& sequences);
std::vector<int> CountRequiredBlocks(const Sequences& sequences, //
const std::vector<int>& context_lengths,
int step_length);
static void SortByPriority(Sequences& sequences, //
std::vector<int>& context_lengths,
const std::vector<uint64_t>& priorities);
static void AssignAndActivate(const Sequences& sequences, //
const std::vector<int>& block_counts,
const std::vector<const Block*>& blocks);
private:
int block_seq_len_;
int rank_;
size_t val_offset_{};
bool need_verify_{};
// Use `std::map` to avoid reference invalidation
std::map<uint64_t, Sequence> sequences_;
std::unique_ptr<BlockManager> block_manager_;
std::vector<const Block*> unlocked_;
std::vector<const Block*> freed_;
};
inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc)
{
os << "allocation: " << oc.allocation << ", swap-in: " << oc.swap_in << ", swap-out: " << oc.swap_out;
return os;
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/kernels/decoder_multihead_attention/array_ops.h"
#include "src/turbomind/kernels/gemm_s_f16/common.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/logger.h"
#include <type_traits>
namespace turbomind {
......@@ -199,392 +203,248 @@ void invokeCreateCausalMasks(
template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t);
template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t);
template<typename T>
__global__ void extend_key_cache(T** k_dst,
const size_t dst_offset,
const T* k_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
template<typename Ti, typename To>
struct ExtendKvCache {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
static constexpr int MaxElemSize = std::max(sizeof(Ti), sizeof(To));
static constexpr int X_ELEMS = 16 / MaxElemSize;
// x dim is now handled by uint4 type
const auto key_src = reinterpret_cast<const uint4*>(k_src);
const auto key_dst = reinterpret_cast<uint4*>(k_dst[batch_id] + dst_offset);
using Vi = Array<Ti, X_ELEMS>;
using Vo = Array<To, X_ELEMS>;
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
using Transform = ConvertKvCache<Ti, To>;
const int k_head_size_id = idx % size_per_head_div_x;
const int k_seq_len_id = idx / size_per_head_div_x;
struct Params {
To** k_dst_ptrs;
To** v_dst_ptrs;
const Ti* k_src;
const Ti* v_src;
const int* cu_block_counts;
const int* query_length;
const int* context_length;
int block_length;
size_t dst_layer_offset;
int max_q_len;
int head_num;
int head_dim;
Transform transform_k;
Transform transform_v;
};
if (k_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, D/x, S[t:t+s]]
__device__ void operator()(const Params& params) const
{
const int batch_id = blockIdx.y;
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
k_head_size_id * max_seq_len + // D/x
t_offset + k_seq_len_id; // s + offset
const int query_len = params.query_length[batch_id];
const int history_len = params.context_length[batch_id] - query_len;
const int cu_block_cnt = params.cu_block_counts[batch_id];
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
k_seq_len_id * size_per_head_div_x + // s
k_head_size_id; // D/x
const int head_id = blockIdx.z;
key_dst[dst_idx] = key_src[src_idx];
}
}
const int size_per_head_div_x = params.head_dim / X_ELEMS;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int head_size_id = idx % size_per_head_div_x;
const int seq_len_id = idx / size_per_head_div_x;
template<typename T>
__global__ void extend_value_cache(T** v_dst,
const size_t dst_offset,
const T* v_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int cache_block_index = (seq_len_id + history_len) / params.block_length;
const int cache_block_offset = (seq_len_id + history_len) % params.block_length;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
const auto k_val_src = params.k_src;
const auto v_val_src = params.v_src;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src);
const auto val_dst = reinterpret_cast<uint4*>(v_dst[batch_id] + dst_offset);
const auto k_val_dst = (params.k_dst_ptrs + cu_block_cnt)[cache_block_index] + params.dst_layer_offset;
const auto v_val_dst = (params.v_dst_ptrs + cu_block_cnt)[cache_block_index] + params.dst_layer_offset;
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
if (seq_len_id < query_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * params.block_length * size_per_head_div_x + // H
cache_block_offset * size_per_head_div_x + // s + offset
head_size_id; // D/x
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
const int64_t src_idx = batch_id * params.head_num * params.max_q_len * size_per_head_div_x + // B
head_id * params.max_q_len * size_per_head_div_x + // H
seq_len_id * size_per_head_div_x + // s
head_size_id; // D/x
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
(v_seq_len_id + t_offset) * size_per_head_div_x + // s + offset
v_head_size_id; // D/x
Vi k_vi;
Vi v_vi;
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
Ldg(k_vi, k_val_src + src_idx * X_ELEMS);
Ldg(v_vi, v_val_src + src_idx * X_ELEMS);
val_dst[dst_idx] = val_src[src_idx];
}
}
Vo k_vo = params.transform_k(k_vi);
Vo v_vo = params.transform_v(v_vi);
inline __device__ float2 float2div(float a, float2 b)
{
float2 c;
c.x = b.x / a;
c.y = b.y / a;
return c;
}
inline __device__ float2 float2sub(float zp, float2 val)
{
float2 ret;
ret.x = val.x - zp;
ret.y = val.y - zp;
return ret;
}
static inline __device__ half4 char4_scale_to_half4(char4 value, const float scale, const float zp)
{
half4 dst;
dst.x = __float2half(value.x * scale + zp);
dst.y = __float2half(value.y * scale + zp);
dst.z = __float2half(value.z * scale + zp);
dst.w = __float2half(value.w * scale + zp);
return dst;
}
Store(k_val_dst + dst_idx * X_ELEMS, k_vo);
Store(v_val_dst + dst_idx * X_ELEMS, v_vo);
}
}
};
static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w)
{
uint32_t dst;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
uint32_t a;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
uint32_t b;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y));
uint32_t c;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z));
uint32_t d;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a));
#else
char4 tmp;
tmp.x = x;
tmp.y = y;
tmp.z = z;
tmp.w = w;
dst = reinterpret_cast<const uint32_t&>(tmp);
#endif
return dst;
}
namespace {
template<typename T>
__global__ void extend_value_cache_int8(int8_t** v_dst,
const size_t dst_offset,
const T* v_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len,
const float v_scale,
const float v_zp)
template<class Kernel, class Params>
__global__ void KernelWrapper(Params params)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src);
const auto val_dst = reinterpret_cast<uint2*>(v_dst[batch_id] + dst_offset);
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
(v_seq_len_id + t_offset) * size_per_head_div_x + // s + offset
v_head_size_id; // D/x
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
Kernel{}(params);
};
// scale to int8 and write
const auto value = val_src[src_idx];
auto to_ptr = reinterpret_cast<uint32_t*>(val_dst + dst_idx);
float2 float2_0 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.x)));
float2 float2_1 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.y)));
to_ptr[0] = float4_to_char4(float2_0.x, float2_0.y, float2_1.x, float2_1.y);
float2_0 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.z)));
float2_1 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.w)));
to_ptr[1] = float4_to_char4(float2_0.x, float2_0.y, float2_1.x, float2_1.y);
}
}
} // namespace
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t dst_offset,
void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const T* k_src,
const T* v_src,
int local_batch_size,
const int* cu_block_counts,
const int* query_length,
const int* context_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream,
int head_dim,
int head_num,
int quant,
const float* kv_scale)
const float* kv_params,
cudaStream_t stream)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_q_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num);
if (quant & QuantPolicy::kCacheKVInt8) {
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(k_dst),
dst_offset,
k_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len,
kv_scale[0],
kv_scale[1]);
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(v_dst),
dst_offset,
v_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len,
kv_scale[2],
kv_scale[3]);
}
else {
extend_value_cache<<<grid, block_sz, 0, stream>>>(k_dst,
dst_offset,
k_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len);
extend_value_cache<<<grid, block_sz, 0, stream>>>(v_dst,
dst_offset,
v_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len);
}
}
template void invokeExtendKVCache(float**,
float**,
size_t,
const float*,
const float*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream,
int,
const float*);
template void invokeExtendKVCache(half**,
half**,
size_t,
const half*,
const half*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream,
int,
const float*);
template<typename T>
__global__ void transpose_value_cache(T* v_dst, //
const T** v_src,
const size_t src_offset,
const int head_num,
const int head_n_rep,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src[batch_id] + src_offset);
const auto val_dst = reinterpret_cast<uint4*>(v_dst);
const auto seq_len = seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id / head_n_rep * size_per_head_div_x * max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
val_dst[dst_idx] = val_src[src_idx];
}
auto fn = [&](auto value) {
using Tout = decltype(value);
using Kernel = ExtendKvCache<T, Tout>;
dim3 grid((max_q_len * head_dim / Kernel::X_ELEMS + block_sz - 1) / block_sz, batch_size, head_num);
typename Kernel::Params params{(Tout**)k_dst_ptrs,
(Tout**)v_dst_ptrs,
k_src,
v_src,
cu_block_counts,
query_length,
context_length,
block_length,
dst_layer_offset,
max_q_len,
head_num,
head_dim,
{kv_params[0], kv_params[1]},
{kv_params[2], kv_params[3]}};
KernelWrapper<Kernel><<<grid, block_sz, 0, stream>>>(params);
};
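// pick the destination element type: int8 when KV-int8 quantization is enabled, otherwise T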
(quant & QuantPolicy::kCacheKVInt8) ? fn(int8_t{}) : fn(T{});
}
template<typename T>
__global__ void transpose_value_cache_int8(T* v_dst, //
const int8_t** v_src,
const size_t src_offset,
const int head_num,
const int head_n_rep,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len,
const float v_scale,
const float v_zp)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint2*>(v_src[batch_id] + src_offset);
const auto val_dst = reinterpret_cast<uint4*>(v_dst);
const auto seq_len = seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id / head_n_rep * size_per_head_div_x * max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
// int8x8 -> fp16x8
const auto from_ptr = reinterpret_cast<const char4*>(val_src + src_idx);
auto to_ptr = reinterpret_cast<half4*>(val_dst + dst_idx);
to_ptr[0] = char4_scale_to_half4(from_ptr[0], v_scale, v_zp);
to_ptr[1] = char4_scale_to_half4(from_ptr[1], v_scale, v_zp);
template void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const float* k_src,
const float* v_src,
const int* cu_block_counts,
const int* query_length,
const int* history_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
int head_dim,
int head_num,
int quant,
const float* kv_scale,
cudaStream_t stream);
template void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const half* k_src,
const half* v_src,
const int* cu_block_counts,
const int* query_length,
const int* history_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
int head_dim,
int head_num,
int quant,
const float* kv_scale,
cudaStream_t stream);
template<typename Ti, typename To>
struct TransposeKvCache {
static constexpr int MaxElemSize = std::max(sizeof(Ti), sizeof(To));
static constexpr int X_ELEMS = 16 / MaxElemSize;
using Vi = Array<Ti, X_ELEMS>;
using Vo = Array<To, X_ELEMS>;
using Transform = ConvertKvCache<Ti, To>;
struct Params {
To* k_dst;
To* v_dst;
const Ti** k_src;
const Ti** v_src;
size_t src_offset;
int head_num;
int head_n_rep;
int size_per_head;
const int* seq_length;
int max_kv_len;
int max_seq_len;
Transform transform_k;
Transform transform_v;
// float k_scale;
// float k_zp;
// float v_scale;
// float v_zp;
};
__device__ void operator()(const Params& params) const
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int size_per_head_div_x = params.size_per_head / X_ELEMS;
const auto k_src = params.k_src[batch_id] + params.src_offset;
const auto v_src = params.v_src[batch_id] + params.src_offset;
const auto k_dst = params.k_dst;
const auto v_dst = params.v_dst;
const auto seq_len = params.seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id / params.head_n_rep * size_per_head_div_x * params.max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * params.head_num * size_per_head_div_x * params.max_kv_len + // B
head_id * size_per_head_div_x * params.max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
Vi k_vi;
Vi v_vi;
Ldg(k_vi, k_src + src_idx * X_ELEMS);
Ldg(v_vi, v_src + src_idx * X_ELEMS);
Vo k_vo = params.transform_k(k_vi);
Vo v_vo = params.transform_v(v_vi);
Store(k_dst + dst_idx * X_ELEMS, k_vo);
Store(v_dst + dst_idx * X_ELEMS, v_vo);
}
}
}
};
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
......@@ -601,59 +461,34 @@ void invokeTransposeKVCache(T* key_cache_trans,
int head_n_rep,
cudaStream_t stream,
int quant,
const float* kv_scale)
const float* kv_params)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_kv_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num);
if (quant & QuantPolicy::kCacheKVInt8) {
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(key_cache_trans,
reinterpret_cast<const int8_t**>(key_cache),
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
kv_scale[0],
kv_scale[1]);
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(val_cache_trans,
reinterpret_cast<const int8_t**>(val_cache),
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
kv_scale[2],
kv_scale[3]);
}
else {
transpose_value_cache<<<grid, block_sz, 0, stream>>>(key_cache_trans,
key_cache,
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len);
transpose_value_cache<<<grid, block_sz, 0, stream>>>(val_cache_trans,
val_cache,
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len);
}
auto fn = [&](auto value) {
using Tin = decltype(value);
using Kernel = TransposeKvCache<Tin, T>;
dim3 grid((max_kv_len * size_per_head / Kernel::X_ELEMS + block_sz - 1) / block_sz, batch_size, head_num);
typename Kernel::Params params{key_cache_trans,
val_cache_trans,
(const Tin**)key_cache,
(const Tin**)val_cache,
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
{kv_params[0], kv_params[1]},
{kv_params[2], kv_params[3]}};
KernelWrapper<Kernel><<<grid, block_sz, 0, stream>>>(params);
};
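// pick the source element type: int8 when KV-int8 quantization is enabled, otherwise T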
(quant & QuantPolicy::kCacheKVInt8) ? fn(int8_t{}) : fn(T{});
}
template void invokeTransposeKVCache(float*,
......@@ -718,12 +553,59 @@ void invokeGatherOutput(int* output_ids,
int batch_size,
cudaStream_t stream)
{
int block_size = 512;
int block_size = 128;
int grid_size = batch_size;
gatherOutput<<<grid_size, block_size, 0, stream>>>(
output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
}
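// One thread block per batch slot: copy that sequence's tokens (up to the requested
// output length) into the per-request output buffer and write back the sequence length.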
__global__ void updateOutput(int** request_output_ids_ptrs,
int** request_seqlen_ptrs,
const int* output_ids,
const int* sequence_lengths,
const int* request_output_ids_lens,
int max_session_len,
bool token_generated)
{
const int batch_id = blockIdx.x;
auto request_output_ids = request_output_ids_ptrs[batch_id];
auto request_seqlen = request_seqlen_ptrs[batch_id];
output_ids += max_session_len * batch_id;
const int seqlen = sequence_lengths[batch_id] + (int)token_generated;
const int output_len = min(seqlen, request_output_ids_lens[batch_id]);
for (int i = threadIdx.x; i < output_len; i += blockDim.x) {
request_output_ids[i] = output_ids[i];
}
*request_seqlen = seqlen;
}
void invokeUpdateOutput(int** request_output_ids_ptrs,
int** request_seqlen_ptrs,
const int* output_ids,
const int* sequence_lengths,
const int* request_output_ids_lens,
int max_session_len,
bool token_generated,
int batch_size,
cudaStream_t stream)
{
constexpr int block_size = 128;
const int grid_size = batch_size;
updateOutput<<<grid_size, block_size, 0, stream>>>(request_output_ids_ptrs,
request_seqlen_ptrs,
output_ids,
sequence_lengths,
request_output_ids_lens,
max_session_len,
token_generated);
}
#define VERSION_SWITCH(VERSION, CONST_NAME, ...) \
[&] { \
if (VERSION == 2) { \
......
......@@ -34,21 +34,22 @@ void invokeCreateCausalMasks(
T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream);
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t layer_offset,
void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const T* k_src,
const T* v_src,
int batch_size,
const int* cu_block_counts,
const int* query_length,
const int* context_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream,
int head_dim,
int head_num,
int quant,
const float* kv_scale);
const float* kv_scale,
cudaStream_t stream);
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
......@@ -76,6 +77,16 @@ void invokeGatherOutput(int* output_ids,
int batch_size,
cudaStream_t stream);
void invokeUpdateOutput(int** request_output_ids_ptrs,
int** request_seqlen_ptrs,
const int* output_ids,
const int* sequence_lengths,
const int* request_output_ids_lens,
int max_session_len,
bool token_generated,
int batch_size,
cudaStream_t stream);
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
template<typename T>
......
......@@ -5,11 +5,12 @@
namespace turbomind {
struct LlamaAttentionParams {
int rotray_embedding_dim;
int rotary_embedding_dim;
float rotary_embedding_base;
int max_position_embeddings;
bool use_dynamic_ntk;
bool use_logn_attn;
float rope_scaling_factor;
// bool use_dynamic_ntk;
bool use_logn_attn;
};
} // namespace turbomind
......@@ -157,4 +157,13 @@ bool isDebug()
return is_debug;
}
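// Thread-local table mapping a batch slot index to a sequence id; grows on demand
// and defaults to -1 for unused slots.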
int64_t& gSequenceIds(int batch_idx)
{
thread_local std::vector<int64_t> ids{};
if (batch_idx >= ids.size()) {
ids.resize(batch_idx + 1, -1);
}
return ids.at(batch_idx);
}
} // namespace turbomind
......@@ -2,6 +2,7 @@
#pragma once
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include <cuda_runtime.h>
#include <sstream>
#include <string>
......@@ -66,4 +67,18 @@ size_t curandStateGetSize();
bool isDebug();
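// RAII helper: pushes an NVTX range on construction and pops it on destruction.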
struct NvtxScope {
explicit NvtxScope(const std::string& name)
{
PUSH_RANGE(name.c_str());
}
~NvtxScope()
{
POP_RANGE;
}
};
int64_t& gSequenceIds(int batch_idx);
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "BlockManager.h"
#include "SequenceManager.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/debug_utils.h"
#include <catch2/catch_test_macros.hpp>
#include <iterator>
using namespace turbomind;
std::ostream& operator<<(std::ostream& os, const Block* b)
{
os << "(" << b->id << "," << b->timestamp << ")";
return os;
}
TEST_CASE("BlockManager")
{
Allocator<AllocatorType::CUDA> allocator(0);
BlockManager m(1024, 32, 8, &allocator);
REQUIRE(m.max_block_count() == 32);
REQUIRE(m.free_count() == 32);
auto blocks1 = m.Allocate(10);
dbg(blocks1);
REQUIRE(blocks1.size() == 10);
REQUIRE(m.active_count() == blocks1.size());
REQUIRE(m.free_count() == 22);
auto blocks2 = m.Allocate(6);
REQUIRE(blocks2.size() == 6);
REQUIRE(m.active_count() == blocks1.size() + blocks2.size());
REQUIRE(m.free_count() == 16);
auto blocks3 = m.Allocate(16);
REQUIRE(blocks3.size() == 16);
REQUIRE(m.active_count() == 32);
REQUIRE(m.free_count() == 0);
std::copy(blocks3.begin(), blocks3.end(), std::back_inserter(blocks1));
std::copy(blocks2.begin(), blocks2.end(), std::back_inserter(blocks1));
m.Touch(blocks1);
REQUIRE(m.Unlock(blocks1) == 32);
REQUIRE(m.active_count() == 0);
REQUIRE(m.free_count() == 0);
REQUIRE(m.cached_count() == 32);
m.Evict(16);
REQUIRE(m.active_count() == 0);
REQUIRE(m.free_count() == 16);
REQUIRE(m.cached_count() == 16);
auto blocks4 = m.Allocate(14);
REQUIRE(m.active_count() == 14);
REQUIRE(m.free_count() == 2);
REQUIRE(m.cached_count() == 16);
}
TEST_CASE("SequenceManager basic test")
{
Allocator<AllocatorType::CUDA> allocator(0);
SequenceManager manager(32, 32, 128, 128, 20, 4, 16, 0, &allocator);
REQUIRE(manager.max_block_count() == 20);
REQUIRE(manager.Contains(1) == false);
auto s1 = manager.Create(1);
dbg(*s1);
REQUIRE(manager.Contains(1) == true);
manager.Erase(1);
REQUIRE(manager.Contains(1) == false);
s1 = manager.Create(1);
REQUIRE(manager.Contains(1) == true);
auto outcome = manager.Materialize({s1}, {128}, {100}, 1);
dbg(s1->blocks);
REQUIRE(s1->blocks.size() == 2);
auto s2 = manager.Create(2);
REQUIRE(manager.Contains(2));
outcome = manager.Materialize({s1, s2}, {128, 2559}, {2, 1}, 1);
dbg(outcome);
REQUIRE(outcome.allocation == 20);
REQUIRE(outcome.swap_in == 1);
REQUIRE(outcome.swap_out == 1);
auto s3 = manager.Create(3);
outcome = manager.Materialize({s1, s2, s3}, {127, 2559, 255}, {1, 100, 2}, 1);
dbg(outcome);
}
TEST_CASE("SequenceManager functional test")
{
Allocator<AllocatorType::CUDA> allocator(0);
SequenceManager manager(32, 32, 128, 128, 20, 4, 16, 0, &allocator);
auto seq = manager.Create(1);
for (int i = 0; i < 1024; ++i) {
auto outcome = manager.Materialize({seq}, {i}, {0}, 1);
if (outcome.allocation) {
dbg(i, outcome);
}
}
}
......@@ -65,7 +65,7 @@ void LlamaTritonModel<T>::handleMissingParams()
}
if (!max_batch_size_) {
max_batch_size_ = 32;
max_batch_size_ = 64;
TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
}
......@@ -74,6 +74,12 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
}
if (!attn_params_.max_position_embeddings) {
attn_params_.max_position_embeddings = session_len_;
TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to `session_len` (%d).",
(int)attn_params_.max_position_embeddings);
}
if (!max_context_token_num_) {
max_context_token_num_ = (int)std::sqrt(max_batch_size_);
TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
......@@ -85,14 +91,18 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
}
if (!cache_max_entry_count_) {
cache_max_entry_count_ = 32;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
(int)cache_max_entry_count_);
if (!cache_max_block_count_) {
cache_max_block_count_ = .95f;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %f.", cache_max_block_count_);
}
if (!cache_block_seq_len_) {
cache_block_seq_len_ = 128;
TM_LOG_WARNING("[LlamaTritonModel] `cache_block_seq_len` is not set, default to %d.", cache_block_seq_len_);
}
if (!cache_chunk_size_) {
cache_chunk_size_ = cache_max_entry_count_;
cache_chunk_size_ = cache_max_block_count_;
TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
}
}
......@@ -129,17 +139,21 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
max_context_token_num_ = reader.GetInteger("llama", "max_context_token_num", 0);
session_len_ = reader.GetInteger("llama", "session_len", 0);
step_length_ = reader.GetInteger("llama", "step_length", 0);
cache_max_entry_count_ = reader.GetInteger("llama", "cache_max_entry_count", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
cache_max_block_count_ = reader.GetFloat("llama", "cache_max_entry_count", 0);
cache_block_seq_len_ = reader.GetInteger("llama", "cache_block_seq_len", 0);
cache_chunk_size_ = reader.GetInteger("llama", "cache_chunk_size", 0);
attn_bias_ = reader.GetInteger("llama", "attn_bias", 0);
quant_policy_ = reader.GetInteger("llama", "quant_policy", 0);
group_size_ = reader.GetInteger("llama", "group_size", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
attn_bias_ = reader.GetInteger("llama", "attn_bias", 0);
quant_policy_ = reader.GetInteger("llama", "quant_policy", 0);
group_size_ = reader.GetInteger("llama", "group_size", 0);
attn_params_.rotray_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
// rotary embedding parameters
attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
attn_params_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f);
attn_params_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f);
attn_params_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0);
attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
// attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
attn_params_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0);
handleMissingParams();
......@@ -219,7 +233,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
ft::NcclParam pipeline_para = nccl_params.second[comms_rank];
ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_);
ft::FT_CHECK(pipeline_para.world_size_ = pipeline_para_size_);
ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_);
auto llama = std::make_unique<ft::LlamaV2<T>>(head_num_,
kv_head_num_,
......@@ -235,7 +249,8 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
step_length_,
start_id_,
end_id_,
cache_max_entry_count_,
cache_max_block_count_,
cache_block_seq_len_,
cache_chunk_size_,
quant_policy_,
use_context_fmha_,
......@@ -320,12 +335,13 @@ std::string LlamaTritonModel<T>::toString()
<< "\ninter_size: " << inter_size_ << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_
<< "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << max_batch_size_
<< "\nmax_context_token_num: " << max_context_token_num_ << "\nsession_len: " << session_len_
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_entry_count_
<< "\ncache_chunk_size: " << cache_chunk_size_ << "\nuse_context_fmha: " << use_context_fmha_
<< "\nstart_id: " << start_id_ << "\ntensor_para_size: " << tensor_para_size_
<< "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_
<< "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << quant_policy_
<< "\ngroup_size: " << group_size_ << std::endl;
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_block_count_
<< "\ncache_block_seq_len: " << cache_block_seq_len_ << "\ncache_chunk_size: " << cache_chunk_size_
<< "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_
<< "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_
<< "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_
<< "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << quant_policy_ << "\ngroup_size: " << group_size_
<< std::endl;
return ss.str();
}
......
......@@ -93,7 +93,8 @@ private:
int step_length_;
int start_id_;
int end_id_;
int cache_max_entry_count_;
float cache_max_block_count_;
int cache_block_seq_len_;
int cache_chunk_size_;
int use_context_fmha_;
size_t tensor_para_size_;
......
......@@ -272,6 +272,7 @@ struct AbstractTransformerModelInstance;
struct AbstractTransformerModelInstance {
virtual ~AbstractTransformerModelInstance() = default;
virtual std::shared_ptr<std::vector<triton::Tensor>>
forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) = 0;
......
......@@ -131,7 +131,7 @@ void check(T result, char const* const func, const char* const file, int const l
inline void syncAndCheck(const char* const file, int const line)
{
// When FT_DEBUG_LEVEL=DEBUG, must check error
static char* level_name = std::getenv("FT_DEBUG_LEVEL");
static char* level_name = std::getenv("TM_DEBUG_LEVEL");
if (level_name != nullptr) {
static std::string level = std::string(level_name);
if (level == "DEBUG") {
......
#pragma once
#if __has_include("3rdparty/dbg.h")
#include "3rdparty/dbg.h"
#else
#define dbg(...)
#endif