Unverified commit 9efcac38 authored by Li Zhang, committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/logger.h"
#include <cstdint>
#include <iomanip>
#include <sstream>
#include <unordered_map>
namespace fastertransformer {
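// Validates incoming stop/infer requests before they are scheduled:
//  - ids that occur more than once across the two lists are rejected with Request::kConflict,
//  - requests carrying both start_flag and stop_flag, or continuation requests whose sequence
//    is not present in the kv cache, are rejected with Request::kInvalid,
//  - stop requests for sequences that are not active are rejected with Request::kInactive,
//  - infer requests whose id is already in the active batch are rejected with Request::kBusy.
// Rejected requests are signalled with the error code and dropped from the lists.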
template<typename T>
void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
std::vector<std::shared_ptr<Request>>& infer_reqs)
{
std::unordered_map<uint64_t, int> occurance;
auto count_occurance = [&occurance](const std::vector<std::shared_ptr<Request>>& rs) {
for (const auto& r : rs) {
++occurance[r->id];
}
};
auto invalidate = [](const char* type, std::shared_ptr<Request>& req, int ec) {
FT_LOG_WARNING("[verifyRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
req->signal.set_value(ec);
req.reset();
};
auto handle_conflict_or_invalid = [this, &occurance, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
const char* type) {
for (auto& r : rs) {
if (r) {
int ec = 0;
if (occurance[r->id] != 1) {
ec = Request::kConflict;
}
else if (r->start_flag && r->stop_flag) {
ec = Request::kInvalid;
}
else if (!r->start_flag && !llama_->kv_cache_mgr_->contains(r->id)) {
ec = Request::kInvalid;
}
if (ec) {
invalidate(type, r, ec);
}
}
}
};
auto drop_invalid = [](std::vector<std::shared_ptr<Request>>& rs) {
int count = 0;
for (int i = 0; i < rs.size(); ++i) {
if (rs[i]) {
rs[count++] = std::move(rs[i]);
}
}
rs.resize(count);
};
count_occurance(stop_reqs);
count_occurance(infer_reqs);
if (!stop_reqs.empty()) {
handle_conflict_or_invalid(stop_reqs, "stop");
// invalidate stop-only requests for inactive sequences
for (auto& r : stop_reqs) {
if (r && r->end_flag == false) {
int ec = Request::kInactive;
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i] && requests_[i]->id == r->id) {
ec = 0;
break;
}
}
if (ec) {
invalidate("stop", r, ec);
}
}
}
drop_invalid(stop_reqs);
}
if (!infer_reqs.empty()) {
handle_conflict_or_invalid(infer_reqs, "infer");
// invalidate requests for busy sequences
for (auto& r : infer_reqs) {
if (r) {
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i] && requests_[i]->id == r->id) {
invalidate("infer", r, Request::kBusy);
break;
}
}
}
}
drop_invalid(infer_reqs);
}
}
template<typename T>
void LlamaBatch<T>::handleStopRequests(const std::vector<std::shared_ptr<Request>>& requests)
{
for (const auto& r : requests) {
int ec = Request::kFail;
// find matching active sequence
for (int i = 0; i < batch_size_; ++i) {
// stop & optionally erase active sequence
if (requests_[i] && requests_[i]->id == r->id) {
ec = 0;
finishRequest(i, r->end_flag);
break;
}
}
// mismatch, try erase inactive sequence
if (ec && r->end_flag) {
ec = 0;
llama_->kv_cache_mgr_->erase(r->id);
}
// clear output buffers (prevent leaking conversations) if the request succeeded
if (ec == 0) {
auto& output_ids = r->outputs[rank_].at("output_ids");
auto& sequence_length = r->outputs[rank_].at("sequence_length");
check_cuda_error(
cudaMemsetAsync(output_ids.getPtr<int>(), 0, sizeof(int) * output_ids.shape.at(2), stream_));
check_cuda_error(cudaMemsetAsync(sequence_length.getPtr<int>(), 0, sizeof(int), stream_));
check_cuda_error(cudaStreamSynchronize(stream_));
}
if (rank_ == 0) {
r->signal.set_value(ec);
}
}
}
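// (Re)allocates the per-iteration device buffers for the current decoding batch.
// `token_ids_buf_` is sized 2 * session_len because `step_` may range up to 2 * session_len
// (see the note in initializeGeneration); the `context_decoder_*` buffers are sized by
// `max_context_token_num_` rather than by the batch size.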
template<typename T>
void LlamaBatch<T>::allocateBuffer(size_t batch_size, size_t session_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
const size_t batchxbeam = batch_size;
const size_t hidden_units = llama_->hidden_units_;
const size_t vocab_size = llama_->vocab_size_;
context_decoder_input_buf_ =
(T*)allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
context_decoder_ids_buf_ =
(int*)allocator_->reMalloc(context_decoder_ids_buf_, sizeof(int) * max_context_token_num_, false);
decoder_input_buf_ = (T*)allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units, false);
decoder_output_buf_ = (T*)allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units, false);
input_ids_buf_ = (int*)allocator_->reMalloc(input_ids_buf_, sizeof(int) * batchxbeam * session_len, true);
input_length_buf_ = (int*)allocator_->reMalloc(input_length_buf_, sizeof(int) * batchxbeam);
history_length_buf_ = (int*)allocator_->reMalloc(history_length_buf_, sizeof(int) * batchxbeam);
context_length_buf_ = (int*)allocator_->reMalloc(context_length_buf_, sizeof(int) * batchxbeam);
total_padding_count_ = (int*)allocator_->reMalloc(total_padding_count_, sizeof(int) * batchxbeam, false);
sequence_lengths_ = (int*)allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false);
k_cache_ptr_buf_ = (uint64_t*)allocator_->reMalloc(k_cache_ptr_buf_, sizeof(uint64_t) * batchxbeam);
v_cache_ptr_buf_ = (uint64_t*)allocator_->reMalloc(v_cache_ptr_buf_, sizeof(uint64_t) * batchxbeam);
logits_buf_ = (float*)allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);
local_logits_buf_ = (float*)allocator_->reMalloc(local_logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);
token_ids_buf_ = (int*)allocator_->reMalloc(token_ids_buf_, sizeof(int) * batchxbeam * session_len * 2, true);
end_ids_buf_ = (int*)allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false);
finished_buf_ = (bool*)allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false);
seq_limit_len_ = (uint32_t*)allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaBatch<T>::allocatePersistantBuffer(size_t max_batch_size)
{
output_ids_buf_ = (int*)allocator_->reMalloc(output_ids_buf_, sizeof(int) * max_batch_size * session_len_, true);
stop_words_buf_ =
(int*)allocator_->reMalloc(stop_words_buf_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
bad_words_buf_ =
(int*)allocator_->reMalloc(bad_words_buf_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
h_runtime_top_k_ = (int*)allocator_->reMalloc(h_runtime_top_k_, sizeof(int) * max_batch_size, true, true);
h_runtime_top_p_ = (float*)allocator_->reMalloc(h_runtime_top_p_, sizeof(float) * max_batch_size, true, true);
h_temperature_ = (float*)allocator_->reMalloc(h_temperature_, sizeof(float) * max_batch_size, true, true);
h_repetition_penalty_ =
(float*)allocator_->reMalloc(h_repetition_penalty_, sizeof(float) * max_batch_size, true, true);
h_random_seed_ = (uint64_t*)allocator_->reMalloc(h_random_seed_, sizeof(uint64_t) * max_batch_size, true, true);
sampling_params_ = {{"stop_words_list", stop_words_buf_},
{"bad_words_list", bad_words_buf_},
{"runtime_top_k", h_runtime_top_k_},
{"runtime_top_p", h_runtime_top_p_},
{"temperature", h_temperature_},
{"repetition_penalty", h_repetition_penalty_},
{"random_seed", h_random_seed_}};
topk_curandstate_buf_ = allocator_->reMalloc(topk_curandstate_buf_, sizeof(curandState_t) * max_batch_size, true);
topp_curandstate_buf_ = allocator_->reMalloc(topp_curandstate_buf_, sizeof(curandState_t) * max_batch_size, true);
{
NcclGuard barrier(llama_->tensor_para_, stream_, true);
h_input_ids_buf_ =
(int*)allocator_->reMalloc(h_input_ids_buf_, sizeof(int) * max_batch_size * session_len_, false, true);
h_input_length_buf_ =
(int*)allocator_->reMalloc(h_input_length_buf_, sizeof(int) * max_batch_size, false, true);
h_history_length_buf_ =
(int*)allocator_->reMalloc(h_history_length_buf_, sizeof(int) * max_batch_size, false, true);
h_context_length_buf_ =
(int*)allocator_->reMalloc(h_context_length_buf_, sizeof(int) * max_batch_size, false, true);
h_sequence_lengths_ =
(int*)allocator_->reMalloc(h_sequence_lengths_, sizeof(int) * max_batch_size, false, true);
h_k_cache_ptr_buf_ =
(uintptr_t*)allocator_->reMalloc(h_k_cache_ptr_buf_, sizeof(uintptr_t) * max_batch_size, true, true);
h_v_cache_ptr_buf_ =
(uintptr_t*)allocator_->reMalloc(h_v_cache_ptr_buf_, sizeof(uintptr_t) * max_batch_size, true, true);
h_finished_buf_ = (bool*)allocator_->reMalloc(h_finished_buf_, sizeof(bool) * max_batch_size, false, true);
h_seq_limit_len_ =
(uint32_t*)allocator_->reMalloc(h_seq_limit_len_, sizeof(uint32_t) * max_batch_size, false, true);
}
is_allocate_persistant_buffer_ = true;
}
template<typename T>
void LlamaBatch<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&context_decoder_input_buf_);
allocator_->free((void**)&context_decoder_ids_buf_);
allocator_->free((void**)&decoder_input_buf_);
allocator_->free((void**)&decoder_output_buf_);
allocator_->free((void**)&input_ids_buf_);
allocator_->free((void**)&input_length_buf_);
allocator_->free((void**)&history_length_buf_);
allocator_->free((void**)&context_length_buf_);
allocator_->free((void**)&total_padding_count_);
allocator_->free((void**)&sequence_lengths_);
allocator_->free((void**)&k_cache_ptr_buf_);
allocator_->free((void**)&v_cache_ptr_buf_);
allocator_->free((void**)&logits_buf_);
allocator_->free((void**)&local_logits_buf_);
allocator_->free((void**)&token_ids_buf_);
allocator_->free((void**)&end_ids_buf_);
allocator_->free((void**)&finished_buf_);
allocator_->free((void**)&seq_limit_len_);
is_allocate_buffer_ = false;
}
if (is_allocate_persistant_buffer_) {
allocator_->free((void**)&h_input_ids_buf_, true);
allocator_->free((void**)&h_input_length_buf_, true);
allocator_->free((void**)&h_history_length_buf_, true);
allocator_->free((void**)&h_context_length_buf_, true);
allocator_->free((void**)&h_sequence_lengths_, true);
allocator_->free((void**)&h_k_cache_ptr_buf_, true);
allocator_->free((void**)&h_v_cache_ptr_buf_, true);
allocator_->free((void**)&h_seq_limit_len_, true);
allocator_->free((void**)&h_finished_buf_, true);
allocator_->free((void**)&output_ids_buf_);
is_allocate_persistant_buffer_ = false;
}
}
template<typename T>
LlamaBatch<T>::LlamaBatch(int max_batch_size, int max_context_token_num, int session_len, LlamaV2<T>* llama):
max_batch_size_(max_batch_size),
max_context_token_num_(max_context_token_num),
session_len_(session_len),
rank_(llama->tensor_para_.rank_),
debug_(llama->debug_),
llama_(llama),
data_type_(getTensorType<T>())
{
stream_ = llama_->stream_;
allocator_ = llama_->allocator_;
cublas_wrapper_ = llama_->cublas_wrapper_;
requests_.resize(max_batch_size);
request_seq_len_limit_.resize(max_batch_size);
cached_seq_.resize(max_batch_size);
allocatePersistantBuffer(max_batch_size);
}
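// Gathers per-request sampling parameters (top_k, top_p, temperature, ...) into batched
// tensors: for each known parameter that at least one request provides, the batched buffer
// is zero-filled, the per-slot values are copied in, and the result is handed to the dynamic
// decode layer via `setup`. Random states of ongoing requests (and of new requests without a
// "random_seed") are then restored from the per-slot curand state buffers.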
template<typename T>
void LlamaBatch<T>::initializeSampling(int infer_request_count)
{
TensorMap inputs;
for (const auto& param : sampling_params_) {
const Tensor* ptr{};
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]->inputs[rank_].isExist(param.first)) {
ptr = &requests_[i]->inputs[rank_].at(param.first);
break;
}
}
if (ptr) {
const auto& ref = *ptr;
auto shape = ref.shape;
FT_CHECK(shape[0] == 1);
shape[0] = batch_size_;
const int size_in_bytes = ref.sizeBytes();
check_cuda_error(cudaMemsetAsync(param.second, 0, size_in_bytes * batch_size_, stream_));
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]->inputs[rank_].isExist(param.first)) {
auto& src = requests_[i]->inputs[rank_].at(param.first);
FT_CHECK(ref.shape == src.shape);
check_cuda_error(cudaMemcpyAsync((uint8_t*)param.second + size_in_bytes * i,
src.getPtr<void>(),
size_in_bytes,
cudaMemcpyDefault,
stream_));
}
}
inputs.insert({param.first, {ref.where, ref.type, shape, param.second}});
if (debug_ && rank_ == 0) {
FT_LOG_INFO("[initializeSampling] %s", format({param.first, inputs.at(param.first)}).c_str());
}
}
}
inputs_ = std::move(inputs);
llama_->dynamic_decode_layer_->setup(batch_size_, 1, &inputs_);
for (int i = 0; i < batch_size_; ++i) {
// restore saved random states for ongoing requests, and for new requests without a "random_seed"
if (i < batch_size_ - infer_request_count || !requests_[i]->inputs[rank_].isExist("random_seed")) {
check_cuda_error(cudaMemcpyAsync(llama_->dynamic_decode_layer_->topk_curandstate_buf() + i,
(curandState_t*)topk_curandstate_buf_ + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync(llama_->dynamic_decode_layer_->topp_curandstate_buf() + i,
(curandState_t*)topp_curandstate_buf_ + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
}
}
handleOptArg(&inputs_, "end_id", end_ids_buf_, llama_->end_id_, batch_size_);
cudaStreamSynchronize(0);
}
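// Prepares device state for the generation loop: transposes output ids into the step-major
// `token_ids_buf_`, aligns the last context token of every sequence to index
// `max_context_len_ - 1`, initializes sequence lengths, padding counts and finish flags,
// and sets `step_ = max_context_len_`.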
template<typename T>
void LlamaBatch<T>::initializeGeneration()
{
max_context_len_ = *std::max_element(h_context_length_buf_, h_context_length_buf_ + batch_size_);
check_cuda_error(cudaMemsetAsync(token_ids_buf_, 0, sizeof(int) * batch_size_ * session_len_ * 2, stream_));
invokeTransposeAxis01(token_ids_buf_, output_ids_buf_, batch_size_, session_len_, 1, stream_);
sync_check_cuda_error();
// token_ids_buf_[s, b]
// ABCDe ABCDe e
// ABCDEFGHIJk ABCDEFGHIJk
// ABCDEFGHi -> ABCDEFGHi i
// ABCDEFGh ABCDEFGh h
// ABCd ABCd d
for (int i = 0; i < batch_size_; ++i) {
auto token_ids = token_ids_buf_ + i;
auto p_src = h_context_length_buf_[i] - 1;
auto p_dst = max_context_len_ - 1;
if (p_src != p_dst) { // dst and src of `cudaMemcpyAsync` must not overlap
check_cuda_error(cudaMemcpyAsync(token_ids + p_dst * batch_size_,
token_ids + p_src * batch_size_,
sizeof(int),
cudaMemcpyDefault,
stream_));
}
}
check_cuda_error(cudaMemcpyAsync(
context_length_buf_, h_context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
k_cache_ptr_buf_, h_k_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
v_cache_ptr_buf_, h_v_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(
cudaMemcpyAsync(sequence_lengths_, context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
// `sequence_lengths_` will be increased by dynamic decode
// note that "sequence length" has different semantics in the decoder and in the output
// - in decoder it means length of sequence that has kv cache already computed
// - in output it means length of all tokens (the last generated token does not have k/v cache computed yet)
invokePlusScalar(sequence_lengths_, -1, batch_size_, stream_);
sync_check_cuda_error();
// total_padding_count_
// decoding starts at max_context_len
check_cuda_error(cudaMemsetAsync(total_padding_count_, 0, sizeof(int) * batch_size_, stream_));
invokeUpdatePaddingCount(total_padding_count_, //
context_length_buf_,
max_context_len_,
batch_size_,
1,
stream_);
sync_check_cuda_error();
// seq_limit_len_, will be compared to `step` instead of `sequence_length`, so padding len should be accounted for
for (int i = 0; i < batch_size_; ++i) {
h_seq_limit_len_[i] = request_seq_len_limit_[i] + (max_context_len_ - h_context_length_buf_[i]);
// mask finished sequences
h_finished_buf_[i] = max_context_len_ >= h_seq_limit_len_[i];
}
check_cuda_error(
cudaMemcpyAsync(seq_limit_len_, h_seq_limit_len_, sizeof(uint32_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(
cudaMemcpyAsync(finished_buf_, h_finished_buf_, sizeof(bool) * batch_size_, cudaMemcpyDefault, stream_));
// ! range of step_ [1, 2 * session_len]
// consider a sequence with context_len == session_len and another sequence with context_len == 1 and
// request_output_len == session_len - 1 => step_ will loop in [session_len, 2 * session_len)
step_ = max_context_len_;
if (rank_ == 0) {
FT_LOG_INFO("[initGen] batch_size = %d", (int)batch_size_);
FT_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len_);
FT_LOG_INFO("[initGen] slot sequence_id context_len seq_limit_len finished");
for (int i = 0; i < batch_size_; ++i) {
FT_LOG_INFO("[initGen] %4d %11ld %11d %13d %8d",
i,
(long)cached_seq_[i].id,
h_context_length_buf_[i],
(int)h_seq_limit_len_[i],
(int)h_finished_buf_[i]);
}
}
}
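// Runs a single generation step for the whole batch: embedding lookup of the tokens at
// `step_ - 1`, decoder forward, logits projection and dynamic decode (sampling + stop
// criteria). Returns false when dynamic decode signals that generation should stop.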
template<typename T>
bool LlamaBatch<T>::generate()
{
constexpr int kLogInterval = 10;
if (rank_ == 0 && (step_ - 1) % kLogInterval == 0) {
FT_LOG_INFO("------------------------- step = %d -------------------------", step_ - 1);
}
const bool is_first_step = step_ == max_context_len_;
std::vector<int> prev;
if (debug_ && rank_ == 0 && is_first_step) {
prev.resize(batch_size_);
cudaMemcpyAsync(prev.data(),
token_ids_buf_ + (step_ - 1) * batch_size_,
sizeof(int) * batch_size_,
cudaMemcpyDefault,
stream_);
}
// embeddingLookup(step_ - 1);
llama_->embeddingLookup(decoder_input_buf_, //
token_ids_buf_,
batch_size_,
step_ - 1);
llama_->decoderForward(decoder_output_buf_,
k_cache_ptr_buf_,
v_cache_ptr_buf_,
decoder_input_buf_,
sequence_lengths_,
total_padding_count_,
finished_buf_,
step_,
0,
session_len_,
batch_size_);
llama_->postDecodeEmbedding(logits_buf_, //
local_logits_buf_,
decoder_output_buf_,
batch_size_);
// stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is
// not supported yet.
bool should_stop{};
llama_->dynamicDecode(token_ids_buf_,
finished_buf_,
sequence_lengths_,
&should_stop,
&inputs_,
&outputs_,
logits_buf_,
seq_limit_len_,
context_length_buf_,
end_ids_buf_,
step_,
0,
max_context_len_,
session_len_ * 2,
batch_size_);
if (debug_ && rank_ == 0) {
std::vector<int> curr(batch_size_);
cudaMemcpyAsync(
curr.data(), token_ids_buf_ + step_ * batch_size_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
if (is_first_step) {
std::stringstream sprev;
for (int k = 0; k < prev.size(); ++k) {
sprev << std::setw(6) << prev[k];
}
FT_LOG_INFO("[ lookup ] step = %d, [%s]", step_ - 1, sprev.str().c_str());
}
std::stringstream scurr;
for (int k = 0; k < curr.size(); ++k) {
scurr << std::setw(6) << curr[k];
}
FT_LOG_INFO("[generate] step = %d, [%s]", step_ - 1, scurr.str().c_str());
}
////////////////////////////////////////////////
/// ! increase the step counter
++step_;
return !should_stop;
}
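// Admits new inference requests into the batch: creates or fetches the kv-cache sequence for
// each request, sorts the newcomers by cache-adjusted input length so that slots needing
// context decoding end up contiguous at the tail, then fills the host-side length/pointer
// arrays and copies them to the device.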
template<typename T>
void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infer_requests)
{
FT_CHECK(batch_size_ + infer_requests.size() <= max_batch_size_);
const int infer_request_count = infer_requests.size();
allocateBuffer(batch_size_ + infer_request_count, session_len_);
// handle infer requests
std::vector<int> tmp_input_length(infer_request_count);
std::vector<CachedSeq> tmp_cached_seq;
tmp_cached_seq.reserve(infer_request_count);
int tmp_max_input_length = 0;
for (int i = 0; i < infer_request_count; ++i) {
auto& r = *infer_requests[i];
LlamaCacheManager::Sequence seq{};
if (r.start_flag) {
seq = llama_->kv_cache_mgr_->create(r.id, stream_);
}
else {
seq = llama_->kv_cache_mgr_->fetch(r.id, stream_);
}
const int step = r.inputs[rank_].getVal<int>("step", -1);
if (step >= 0) {
if (step <= seq.token_ids.size()) {
seq.token_ids.resize(step);
seq.cache_len = std::min(seq.cache_len, (size_t)step);
}
else if (rank_ == 0) {
FT_LOG_WARNING("[initialize] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
}
}
// input length with missing cache accounted for
int actual_input_len = r.inputs[rank_].getVal<int>("input_lengths") + (seq.token_ids.size() - seq.cache_len);
// insert `start_id` for empty sequences
if (seq.token_ids.empty() && actual_input_len == 0) {
seq.token_ids.push_back(llama_->start_id_);
seq.cache_len = 0;
actual_input_len = seq.token_ids.size() - seq.cache_len;
}
tmp_input_length[i] = actual_input_len;
tmp_max_input_length = std::max((int)tmp_max_input_length, actual_input_len);
tmp_cached_seq.push_back(std::move(seq));
}
FT_CHECK(tmp_max_input_length > 0);
const int max_input_length = tmp_max_input_length;
// arrange requests in ascending order w.r.t. actual input lengths, so that requests that need
// context decoding are grouped together
{
std::vector<int> idxs(tmp_input_length.size());
std::iota(idxs.begin(), idxs.end(), 0);
std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { return tmp_input_length[i] < tmp_input_length[j]; });
for (int i = 0; i < idxs.size(); ++i) {
requests_[batch_size_ + i] = infer_requests[idxs[i]];
cached_seq_[batch_size_ + i] = tmp_cached_seq[idxs[i]];
}
}
const int count = batch_size_ + infer_requests.size();
std::vector<int> tmp_input_len(count);
for (int i = batch_size_; i < count; ++i) {
const auto& seq = cached_seq_[i];
h_input_length_buf_[i] = requests_[i]->inputs[rank_].getVal<int>("input_lengths");
tmp_input_len[i] = h_input_length_buf_[i];
// prepare output ids
// <--------> max_context_len
// aaaAAAA
// bbbbBBBBBB
// ccCCC
auto output_ids_ptr = output_ids_buf_ + i * session_len_;
// clear the persistent buffer to prevent leaking previous conversation
check_cuda_error(cudaMemsetAsync(output_ids_ptr, 0, sizeof(int) * session_len_, stream_));
if (!seq.token_ids.empty()) {
check_cuda_error(cudaMemcpyAsync(output_ids_ptr, //
seq.token_ids.data(),
sizeof(int) * seq.token_ids.size(),
cudaMemcpyDefault,
stream_));
output_ids_ptr += seq.token_ids.size();
}
if (h_input_length_buf_[i]) {
auto input_ids_ptr = requests_[i]->inputs[rank_].getPtr<int>("input_ids");
check_cuda_error(cudaMemcpyAsync(output_ids_ptr, //
input_ids_ptr,
sizeof(int) * h_input_length_buf_[i],
cudaMemcpyDefault,
stream_));
}
if (!requests_[i]->start_flag && !seq.random_state_.empty()) {
check_cuda_error(cudaMemcpyAsync((curandState_t*)topk_curandstate_buf_ + i,
seq.random_state_.data(),
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync((curandState_t*)topp_curandstate_buf_ + i,
seq.random_state_.data() + sizeof(curandState_t),
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
}
}
for (int i = batch_size_; i < count; ++i) {
const auto& seq = cached_seq_[i];
const int missed = (int)seq.token_ids.size() - seq.cache_len;
auto input_ids_buf = input_ids_buf_ + i * session_len_;
FT_CHECK(missed >= 0);
if (missed > 0) {
check_cuda_error(cudaMemcpyAsync(input_ids_buf, //
seq.token_ids.data() + seq.cache_len,
sizeof(int) * missed,
cudaMemcpyDefault,
stream_));
input_ids_buf += missed;
}
auto& input_ids = requests_[i]->inputs[rank_].at("input_ids");
check_cuda_error(cudaMemcpyAsync(input_ids_buf, //
input_ids.getPtr<int>(),
sizeof(int) * h_input_length_buf_[i],
cudaMemcpyDefault,
stream_));
h_input_length_buf_[i] += missed;
h_history_length_buf_[i] = seq.cache_len;
h_context_length_buf_[i] = h_input_length_buf_[i] + h_history_length_buf_[i];
const int request_output_len = requests_[i]->inputs[rank_].getVal<int>("request_output_len");
request_seq_len_limit_[i] = h_context_length_buf_[i] + request_output_len;
// `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len
// the actual sequence length is seq_limit_len + 1, hence seq_limit_len must be truncated to session_len - 1
if (request_seq_len_limit_[i] >= session_len_) {
request_seq_len_limit_[i] = session_len_ - 1;
if (rank_ == 0) {
const int trunc_output_len = request_seq_len_limit_[i] - h_context_length_buf_[i];
FT_LOG_WARNING(
"[initialize] [%ld] total sequence length (%d + %d) exceeds session_len (%d), request_output_len is truncated to %d",
(long)seq.id,
h_context_length_buf_[i],
request_output_len,
(int)session_len_,
trunc_output_len);
}
}
h_k_cache_ptr_buf_[i] = (uint64_t)seq.k_cache;
h_v_cache_ptr_buf_[i] = (uint64_t)seq.v_cache;
}
const int max_context_len = *std::max_element(h_context_length_buf_ + batch_size_, h_context_length_buf_ + count);
batch_size_ = count;
max_context_len_ = max_context_len;
step_ = max_context_len;
check_cuda_error(
cudaMemcpyAsync(input_length_buf_, h_input_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
history_length_buf_, h_history_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
context_length_buf_, h_context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
k_cache_ptr_buf_, h_k_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
v_cache_ptr_buf_, h_v_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
if (llama_->tensor_para_.rank_ == 0) {
FT_LOG_INFO("[init] infer_request_count = %d", (int)infer_request_count);
FT_LOG_INFO("[init] batch_size = %d", (int)batch_size_);
FT_LOG_INFO("[init] session_len = %d", (int)session_len_);
FT_LOG_INFO("[init] max_input_length = %d", (int)max_input_length);
FT_LOG_INFO("[init] max_context_len = %d", (int)max_context_len);
FT_LOG_INFO(
"[init] slot sequence_id history_len input_len context_len tmp_input_len token_ids.size cache_len");
for (int i = batch_size_ - infer_request_count; i < batch_size_; ++i) {
FT_LOG_INFO("[init] %4d %11ld %11d %9d %11d %13d %14d %9d",
i,
(long)cached_seq_[i].id,
h_history_length_buf_[i],
h_input_length_buf_[i],
h_context_length_buf_[i],
tmp_input_len[i],
(int)cached_seq_[i].token_ids.size(),
(int)cached_seq_[i].cache_len);
}
}
}
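// Prefills the kv cache for all slots whose (cache-missed) input length exceeds 1. Inputs are
// concatenated without padding into `context_decoder_ids_buf_` and processed in chunks sized
// so that the accumulated token count stays within `max_context_token_num_`.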
template<typename T>
void LlamaBatch<T>::contextDecode()
{
int base = -1;
for (int i = 0; i < batch_size_; ++i) {
if (h_input_length_buf_[i] > 1) {
base = i;
break;
}
}
if (base >= 0) {
check_cuda_error(cudaStreamSynchronize(stream_));
const auto tick = std::chrono::high_resolution_clock::now();
const int context_decode_count = batch_size_ - base;
if (rank_ == 0) {
FT_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
}
invokePlusScalar(input_length_buf_ + base, -1, context_decode_count, stream_);
invokePlusScalar(context_length_buf_ + base, -1, context_decode_count, stream_);
auto get_input_len = [this](int index) { return h_input_length_buf_[index] - 1; };
auto get_context_len = [this](int index) { return h_context_length_buf_[index] - 1; };
auto token_num = get_input_len(base);
auto max_input_len = get_input_len(base);
auto max_context_len = get_context_len(base);
auto offset = base;
for (int i = offset + 1; i <= batch_size_; ++i) {
if (i == batch_size_ || token_num + h_context_length_buf_[i] > max_context_token_num_) {
const int context_decode_batch_size = i - offset;
if (rank_ == 0) {
FT_LOG_INFO(
"[decodeContext] offset = %d, batch_size = %d, token_num = %d, max_input_len = %d, max_context_len = %d",
offset,
context_decode_batch_size,
token_num,
max_input_len,
max_context_len);
}
// construct context_decoder_ids w/o padding
// aaaa____
// bb______ -> aaaabbcccccccc
// cccccccc
auto context_decoder_ids = context_decoder_ids_buf_;
for (int j = offset; j < i; ++j) {
check_cuda_error(cudaMemcpyAsync(context_decoder_ids,
input_ids_buf_ + j * session_len_,
sizeof(int) * get_input_len(j),
cudaMemcpyDefault,
stream_));
context_decoder_ids += get_input_len(j);
}
llama_->contextDecode(nullptr,
k_cache_ptr_buf_ + offset,
v_cache_ptr_buf_ + offset,
context_decoder_input_buf_,
nullptr,
context_decoder_ids_buf_,
input_length_buf_ + offset,
history_length_buf_ + offset,
context_length_buf_ + offset,
token_num,
max_input_len,
max_context_len,
session_len_,
context_decode_batch_size);
if (i < batch_size_) {
token_num = get_input_len(i);
max_input_len = get_input_len(i);
max_context_len = get_context_len(i);
offset = i;
}
}
else {
token_num += get_input_len(i);
max_input_len = std::max(max_input_len, get_input_len(i));
max_context_len = std::max(max_context_len, get_context_len(i));
}
}
invokePlusScalar(context_length_buf_ + base, 1, context_decode_count, stream_);
invokePlusScalar(input_length_buf_ + base, 1, context_decode_count, stream_);
for (int i = offset; i < batch_size_; ++i) {
h_input_length_buf_[i] = 0;
}
check_cuda_error(cudaStreamSynchronize(stream_));
const auto tock = std::chrono::high_resolution_clock::now();
if (rank_ == 0) {
FT_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
}
}
else if (rank_ == 0) {
FT_LOG_INFO("[decodeContext] Context decoding is not needed.");
}
}
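// Copies finished flags and sequence lengths back to the host, publishes the output tensors,
// fires the per-request stream callbacks on rank 0 and finalizes every slot whose finished
// flag is set.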
template<typename T>
void LlamaBatch<T>::finish()
{
// copy to host the info needed by `synchronize()`
check_cuda_error(
cudaMemcpyAsync(h_finished_buf_, finished_buf_, sizeof(bool) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(
cudaMemcpyAsync(h_sequence_lengths_, sequence_lengths_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
setOutputTensors(step_);
check_cuda_error(cudaStreamSynchronize(stream_));
for (int i = 0; i < batch_size_; ++i) {
FT_CHECK(requests_[i] != nullptr);
if (requests_[i]->stream_cb && rank_ == 0) {
requests_[i]->stream_cb(&requests_[i]->outputs[rank_].get());
}
}
if (debug_ && rank_ == 0) {
std::stringstream ss;
for (int i = 0; i < batch_size_; ++i) {
ss << (i ? ", " : "") << "(" << h_sequence_lengths_[i] << "," << h_finished_buf_[i] << ")";
}
FT_LOG_INFO("[finish] [%s]", ss.str().c_str());
}
for (int i = 0; i < batch_size_; ++i) {
if (h_finished_buf_[i]) {
finishRequest(i, false);
++finished_count_;
}
}
}
template<typename T>
void LlamaBatch<T>::synchronize()
{
// compact
int idx = 0;
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]) {
h_input_length_buf_[idx] = 0;
h_history_length_buf_[idx] = 0;
h_context_length_buf_[idx] = h_sequence_lengths_[i] + 1;
h_sequence_lengths_[idx] = h_context_length_buf_[idx];
check_cuda_error(cudaMemcpyAsync((curandState_t*)topk_curandstate_buf_ + idx,
llama_->dynamic_decode_layer_->topk_curandstate_buf() + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync((curandState_t*)topp_curandstate_buf_ + idx,
llama_->dynamic_decode_layer_->topp_curandstate_buf() + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
if (i != idx) {
h_finished_buf_[idx] = h_finished_buf_[i];
request_seq_len_limit_[idx] = request_seq_len_limit_[i];
h_k_cache_ptr_buf_[idx] = h_k_cache_ptr_buf_[i];
h_v_cache_ptr_buf_[idx] = h_v_cache_ptr_buf_[i];
requests_[idx] = std::move(requests_[i]);
cached_seq_[idx] = std::move(cached_seq_[i]);
check_cuda_error(cudaMemcpyAsync(output_ids_buf_ + idx * session_len_,
output_ids_buf_ + i * session_len_,
sizeof(int) * h_context_length_buf_[idx],
cudaMemcpyDefault,
stream_));
}
++idx;
}
}
batch_size_ = idx;
if (rank_ == 0) {
FT_LOG_INFO("[synchronize] batch_size = %d", (int)batch_size_);
}
finished_count_ = 0;
}
template<typename T>
void LlamaBatch<T>::setOutputTensors(int max_gen_step)
{
// [s,b] -> [b,s] and skip padding in [context_len, max_context_len)
invokeGatherOutput(output_ids_buf_,
token_ids_buf_,
context_length_buf_,
max_context_len_,
max_gen_step,
session_len_,
batch_size_,
stream_);
sync_check_cuda_error();
/// TODO: fuse the loop into a single kernel
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]) {
auto& output_ids = requests_[i]->outputs[rank_].at("output_ids");
auto& sequence_length = requests_[i]->outputs[rank_].at("sequence_length");
check_cuda_error(cudaMemcpyAsync(output_ids.getPtr<int>(),
output_ids_buf_ + i * session_len_,
sizeof(int) * output_ids.shape.at(2),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync(
sequence_length.getPtr<int>(), sequence_lengths_ + i, sizeof(int), cudaMemcpyDefault, stream_));
if (max_gen_step > max_context_len_) { // +1 for newly generated token
invokePlusScalar(sequence_length.getPtr<int>(), 1, 1, stream_);
}
}
}
}
template<typename T>
void LlamaBatch<T>::finishRequest(int index, bool force_end)
{
if (rank_ == 0) {
FT_LOG_INFO("[finishRequest] slot = %d, id = %lu", index, (long)requests_[index]->id);
}
if (debug_ && rank_ == 0) {
std::vector<int> tokens(h_sequence_lengths_[index] + 1);
cudaMemcpyAsync(tokens.data(),
output_ids_buf_ + index * session_len_,
sizeof(int) * tokens.size(),
cudaMemcpyDefault,
stream_);
cudaStreamSynchronize(stream_);
std::stringstream ss;
for (const auto& t : tokens) {
ss << " " << t;
}
FT_LOG_INFO("[finishRequest] slot %d, tokens [%s]", index, ss.str().c_str());
}
auto& output_ids_tensor = requests_[index]->outputs[rank_].at("output_ids");
const auto output_ids_data = output_ids_tensor.getPtr<int>();
if (requests_[index]->end_flag || force_end) {
llama_->kv_cache_mgr_->erase(requests_[index]->id);
}
else {
// the last generated token has not been processed by the decoder, so it has no k/v cache yet
const int n_steps = step_ - max_context_len_;
const int cache_len = h_sequence_lengths_[index];
const int output_len = n_steps > 0 ? cache_len + 1 : cache_len;
auto& seq = cached_seq_[index];
seq.cache_len = cache_len;
// update token IDs
seq.token_ids.resize(output_len);
check_cuda_error(cudaMemcpyAsync(
seq.token_ids.data(), output_ids_data, sizeof(int) * output_len, cudaMemcpyDefault, stream_));
// update random states
seq.random_state_.resize(sizeof(curandState_t) * 2);
check_cuda_error(cudaMemcpyAsync(seq.random_state_.data(),
llama_->dynamic_decode_layer_->topk_curandstate_buf() + index,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync(seq.random_state_.data() + sizeof(curandState_t),
llama_->dynamic_decode_layer_->topp_curandstate_buf() + index,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaStreamSynchronize(stream_));
llama_->kv_cache_mgr_->update(cached_seq_[index], stream_);
}
if (rank_ == 0) {
requests_[index]->signal.set_value(0);
}
requests_[index] = nullptr;
}
template class LlamaBatch<half>;
template class LlamaBatch<float>;
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
namespace fastertransformer {
template<typename T>
class LlamaV2;
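// Batch scheduler for LlamaV2. A rough, illustrative sketch of how a driver loop might call
// into it (the actual caller lives in LlamaV2 and is not part of this file; `batch`,
// `stop_reqs` and `infer_reqs` below are hypothetical names):
//
//   batch.verifyRequests(stop_reqs, infer_reqs);  // reject conflicting / invalid requests
//   batch.handleStopRequests(stop_reqs);          // stop or erase the targeted sequences
//   batch.initialize(infer_reqs);                 // admit new requests into the batch
//   batch.contextDecode();                        // prefill kv cache for new inputs
//   batch.initializeSampling(infer_reqs.size());
//   batch.initializeGeneration();
//   while (batch.generate()) {}                   // one token per iteration
//   batch.finish();                               // publish outputs, finalize finished slots
//   batch.synchronize();                          // compact the remaining active slots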
template<typename T>
class LlamaBatch {
public:
int size() const noexcept
{
return batch_size_;
};
int maxSize() const noexcept
{
return max_batch_size_;
}
int finishedCount() const noexcept
{
return finished_count_;
}
void verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
std::vector<std::shared_ptr<Request>>& infer_reqs);
void handleStopRequests(const std::vector<std::shared_ptr<Request>>& requests);
void allocateBuffer(size_t batch_size, size_t session_len);
void allocatePersistantBuffer(size_t max_batch_size);
void freeBuffer();
void initializeSampling(int infer_request_count);
void initialize(const std::vector<std::shared_ptr<Request>>& infer_requests);
void contextDecode();
void initializeGeneration();
bool generate();
void finish();
void finishRequest(int index, bool force_end);
void synchronize();
void setOutputTensors(int max_gen_step);
explicit LlamaBatch(int max_batch_size, int max_context_token_num, int session_len, LlamaV2<T>* llama);
~LlamaBatch()
{
freeBuffer();
}
private:
const int max_batch_size_;
const int max_context_token_num_;
const int session_len_;
const int rank_;
const bool debug_;
LlamaV2<T>* const llama_;
// active requests
std::vector<std::shared_ptr<Request>> requests_;
T* context_decoder_input_buf_{}; // CTXDEC
// T* context_decoder_output_buf_{}; // CTXDEC
int* context_decoder_ids_buf_{};
T* decoder_input_buf_{}; // CTXDEC, GENERATE
T* decoder_output_buf_{}; // CTXDEC, GENERATE
int* input_ids_buf_{}; // input token ids + cache missed token ids, CTXDEC
int* input_length_buf_{}; // input + cache missed length, CTXDEC, GENERATE
int* history_length_buf_{}; // history length, CTXDEC
int* context_length_buf_{}; // history length + input_length, CTXDEC, GENERATE
int* total_padding_count_{}; // GENERATE
int* sequence_lengths_{}; // current sequence length
uint64_t* k_cache_ptr_buf_{};
uint64_t* v_cache_ptr_buf_{};
float* logits_buf_{}; // combined logits
float* local_logits_buf_{}; // tensor parallel local logits
// used by dynamic decoder
int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step`
int* output_ids_buf_{}; // output ids in [B, S]
int* end_ids_buf_{};
bool* finished_buf_{};
uint32_t* seq_limit_len_{};
// pinned buffers
int* h_input_ids_buf_{};
int* h_input_length_buf_{};
int* h_history_length_buf_{};
int* h_context_length_buf_{};
int* h_sequence_lengths_{};
bool* h_finished_buf_{};
uintptr_t* h_k_cache_ptr_buf_{};
uintptr_t* h_v_cache_ptr_buf_{};
uint32_t* h_seq_limit_len_{};
int* stop_words_buf_{}; // [batch_size, 2, kMaxStopWordsLen]
int* bad_words_buf_{};
int* h_runtime_top_k_{};
float* h_runtime_top_p_{};
float* h_temperature_{};
float* h_repetition_penalty_{};
uint64_t* h_random_seed_{};
void* topk_curandstate_buf_{};
void* topp_curandstate_buf_{};
// hard limits for persistent buffers
static constexpr int kMaxStopBadWordsLen = 32;
using CachedSeq = LlamaCacheManager::Sequence;
std::vector<CachedSeq> cached_seq_;
std::vector<int> request_seq_len_limit_;
const DataType data_type_{};
int batch_size_{};
int max_context_len_{};
int step_{};
int finished_count_{};
bool is_allocate_persistant_buffer_ = false;
bool is_allocate_buffer_ = false;
TensorMap inputs_;
TensorMap outputs_;
std::unordered_map<std::string, void*> sampling_params_;
cudaStream_t stream_{};
cublasMMWrapper* cublas_wrapper_{};
IAllocator* allocator_{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
namespace fastertransformer {
LlamaCacheManager::~LlamaCacheManager()
{
for (auto& p : device_mem_) {
allocator_->free(&p, false);
}
}
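// Returns one cache entry (k and v blocks back to back). Reuses a free entry if available,
// otherwise grows the pool by `chunk_size_` entries (bounded by `max_entry_count_`), and as a
// last resort evicts the least recently used inactive sequence. When pre-allocating, a newly
// created chunk is only added to the free list and no entry is handed out.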
void* LlamaCacheManager::allocate(bool is_preallocte)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate]");
}
void* mem_ptr{};
if (!device_free_.empty()) {
mem_ptr = device_free_.front();
device_free_.pop();
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else if (entry_count_ < max_entry_count_) {
const auto alloc_count = std::min(chunk_size_, max_entry_count_ - entry_count_);
const size_t entry_byte_size = 2 * cache_byte_size_; // 2 for k,v
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
}
const auto chunk_ptr = allocator_->malloc(alloc_count * entry_byte_size, false);
FT_CHECK(chunk_ptr);
device_mem_.push_back(chunk_ptr);
entry_count_ += alloc_count;
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
}
for (int i = 0; i < alloc_count; ++i) {
device_free_.push((uint8_t*)chunk_ptr + entry_byte_size * i);
}
if (!is_preallocte) {
mem_ptr = device_free_.front();
device_free_.pop();
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else {
mem_ptr = evict();
FT_CHECK_WITH_INFO(mem_ptr, "Not enough cache entries.");
}
return mem_ptr;
}
auto LlamaCacheManager::create(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
}
for (const auto& e : device_cache_) {
if (e.id == id) {
if (rank_ == 0) {
FT_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
}
erase(id);
}
}
const auto mem_ptr = (uint8_t*)allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
device_cache_.push_back({
id,
max_seq_len_,
{},
0,
mem_ptr,
mem_ptr + cache_byte_size_,
{},
static_cast<uint64_t>(-1),
});
return device_cache_.back();
}
auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::iterator
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
if (it == device_cache_.end()) {
FT_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
FT_CHECK(0);
}
return it;
}
auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache == nullptr) {
FT_CHECK(entry->cache_len == 0);
const auto mem_ptr = allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
entry->k_cache = mem_ptr;
entry->v_cache = (uint8_t*)entry->k_cache + cache_byte_size_;
}
entry->timestamp = static_cast<uint64_t>(-1);
return *entry;
}
void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
}
auto entry = getEntryOrThrow(seq.id);
entry->timestamp = ++timestamp_;
entry->token_ids = seq.token_ids;
entry->cache_len = seq.cache_len;
FT_CHECK(seq.k_cache == entry->k_cache && seq.v_cache == entry->v_cache);
}
void LlamaCacheManager::erase(uint64_t id)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache) {
device_free_.push(entry->k_cache);
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
}
}
device_cache_.erase(entry);
}
void* LlamaCacheManager::evict()
{
FT_CHECK(!device_cache_.empty());
auto it = std::min_element(device_cache_.begin(), device_cache_.end(), [](const auto& a, const auto& b) {
return a.timestamp < b.timestamp;
});
if (it->timestamp == static_cast<uint64_t>(-1)) {
return nullptr;
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
}
FT_CHECK(it->k_cache);
auto mem_ptr = it->k_cache;
it->k_cache = it->v_cache = nullptr;
it->cache_len = 0;
it->timestamp = static_cast<uint64_t>(-1);
return mem_ptr;
}
bool LlamaCacheManager::contains(uint64_t id) const noexcept
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
return it != device_cache_.end();
}
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/logger.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <queue>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
// k-cache layout [L, H, D/x, S[s:], x]
// v-cache layout [L, H, S[s:], D/x, x]
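// Device-side kv-cache pool with LRU eviction. Each entry holds the k and v cache of one
// sequence across all layers; cache_byte_size_ below is
//   layer_num * head_num * max_seq_len * size_per_head * elem_bits / 8   per direction.
// As a rough, hypothetical example: 32 layers, 32 heads, head dim 128, 16-bit elements and
// max_seq_len 2048 give 512 MiB for k plus 512 MiB for v, i.e. 1 GiB per cached sequence.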
class LlamaCacheManager {
public:
LlamaCacheManager(size_t layer_num,
size_t head_num,
size_t size_per_head,
size_t max_seq_len,
size_t elem_bits,
size_t max_entry_count,
size_t chunk_size,
int rank,
IAllocator* allocator):
layer_num_(layer_num),
head_num_(head_num),
size_per_head_(size_per_head),
max_seq_len_(max_seq_len),
elem_bits_(elem_bits),
cache_byte_size_(layer_num_ * head_num_ * max_seq_len_ * size_per_head_ * elem_bits_ / 8),
max_entry_count_(max_entry_count),
chunk_size_(chunk_size),
rank_(rank),
allocator_(allocator)
{
if (rank == 0) {
FT_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
FT_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
}
allocate(true);
}
~LlamaCacheManager();
struct Sequence {
// header
uint64_t id;
size_t max_seq_len;
// payloads
std::vector<int> token_ids; // all token ids
size_t cache_len; // cache_len == 0 -> cache miss
void* k_cache;
void* v_cache;
std::vector<uint8_t> random_state_; // states for RNGs
// for LRU policy
uint64_t timestamp;
};
Sequence create(uint64_t id, cudaStream_t stream);
Sequence fetch(uint64_t id, cudaStream_t stream);
void update(const Sequence& seq, cudaStream_t stream);
void erase(uint64_t id);
bool contains(uint64_t id) const noexcept;
private:
std::vector<Sequence>::iterator getEntryOrThrow(uint64_t id);
void* allocate(bool is_preallocte);
void* evict();
private:
const size_t layer_num_{};
const size_t head_num_{};
const size_t size_per_head_{};
const size_t max_seq_len_{};
const size_t elem_bits_{};
const size_t cache_byte_size_{};
const size_t max_entry_count_{};
const size_t chunk_size_{};
const int rank_{};
IAllocator* allocator_{};
std::queue<void*> device_free_;
std::vector<void*> device_mem_;
int entry_count_{};
uint64_t timestamp_{};
std::vector<Sequence> device_cache_;
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
size_t num_token,
size_t max_q_len,
size_t max_k_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// no padding
qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * num_token * 3 * local_hidden_units_, true);
// padding is rebuilt for q/k/v_buf_2_
q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * 3 * batch_size * max_q_len * local_hidden_units_, true);
k_buf_2_ = q_buf_2_ + batch_size * max_q_len * local_hidden_units_;
v_buf_2_ = k_buf_2_ + batch_size * max_q_len * local_hidden_units_;
if (use_fmha_) {
FlashAttentionOp<T> flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
if (flash_attention.get_workspace_size() > 0) {
qk_buf_float_ = (float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), true);
}
}
else {
k_cache_buf_ = (T*)allocator_->reMalloc(
k_cache_buf_, 2 * sizeof(T) * batch_size * local_head_num_ * max_k_len * size_per_head_, true);
v_cache_buf_ = k_cache_buf_ + batch_size * local_head_num_ * max_k_len * size_per_head_;
qk_buf_ =
(T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * max_q_len * max_k_len, true);
// qkv_buf_2_ has padding
qkv_buf_2_ =
(T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * max_q_len * local_hidden_units_, true);
}
// qkv_buf_3_ padding is removed
qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * num_token * local_hidden_units_, true);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&q_buf_2_));
if (use_fmha_) {
allocator_->free((void**)&qk_buf_float_);
}
else {
allocator_->free((void**)(&k_cache_buf_));
allocator_->free((void**)(&qk_buf_));
allocator_->free((void**)(&qkv_buf_2_));
}
allocator_->free((void**)(&qkv_buf_3_));
is_allocate_buffer_ = false;
}
}
template<typename T>
inline void LlamaContextAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param input_query [token_num, hidden_dim]
* \param attention_mask [batch_size, 1, max_q_len, max_kv_len]
* \param padding_offset [token_num], int
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param cu_seqlens [batch_size+1], int
* \param max_seq_len [1], int on cpu
* \param is_final_layer [1], bool on cpu
* \param layer_id [1], int on cpu
*
* output_tensors:
* \param hidden_features [token_num, hidden_dim]
* \param key_cache [batch_size], uint64
* \param value_cache [batch_size], uint64
*/
/////////////////////////////////////////////
/// parse inputs
const int batch_size = input_tensors->at("attention_mask").shape[0];
const int max_q_len = input_tensors->at("attention_mask").shape[2];
const int max_k_len = input_tensors->at("attention_mask").shape[3];
const int layer_id = input_tensors->getVal<int>("layer_id");
const int num_token = input_tensors->at("input_query").shape[0];
const int max_seq_len = input_tensors->at("max_seq_len").getVal<int>();
T* attention_out = output_tensors->at("hidden_features").getPtr<T>();
T* attention_input = input_tensors->at("input_query").getPtr<T>();
T* attention_mask = input_tensors->at("attention_mask").getPtr<T>();
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto history_length = input_tensors->at("history_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
const auto padding_offset = input_tensors->at("padding_offset").getPtr<int>();
/////////////////////////////////////////////
/// allocate buffers
allocateBuffer(batch_size, num_token, max_q_len, max_k_len);
//////////////////////////////////////////////
/// qkv gemm
// [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim]
linear_.forward(qkv_buf_, attention_input, num_token, weights->qkv);
//////////////////////////////////////////////
/// transpose qkv & apply rotary embedding & rebuild padding
/// qkv [B, s, 3, H, D] -> (q [B, H, s, D], k [B, H, s, D], v [B, H, s, D])
invokeAddFusedQKVBiasTranspose(q_buf_2_,
k_buf_2_,
v_buf_2_,
PrefixPromptBatchWeightsParam<T>{},
qkv_buf_,
(const T*)nullptr, // qkv_bias
padding_offset, // padding_offset,
history_length, // used for applying rotary embedding
batch_size,
max_q_len, // seq_len
num_token, // batch_size * seq_len
local_head_num_,
size_per_head_,
rotary_embedding_dim_,
neox_rotary_style_,
nullptr, // query_weight.scale_out
0, // int8 mode
stream_);
sync_check_cuda_error();
const size_t layer_offset = layer_id * local_head_num_ * max_seq_len * size_per_head_;
auto k_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
//////////////////////////////////////////////////////////
/// insert the k/v computed from inputs into k/v cache
/// transpose kv -> kv cache
// put k/v_buf from shape [B, H, s, D] to
// k_buf_2 [B, H, s, D] -> key_cache [B, H, S[t:t+s], D/x, x]
// v_buf_2 [B, H, s, D] -> val_cache [B, H, S[t:t+s], D/x, x]
invokeExtendKVCache(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
k_buf_2_,
v_buf_2_,
batch_size,
input_length,
max_q_len,
history_length,
max_seq_len,
size_per_head_,
local_head_num_,
stream_);
sync_check_cuda_error();
if (use_fmha_) {
fusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
attention_mask,
cu_seqlens,
batch_size,
max_q_len,
max_k_len,
max_seq_len);
}
else {
unfusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
attention_mask,
padding_offset,
context_length,
batch_size,
num_token,
max_q_len,
max_k_len,
max_seq_len);
}
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len)
{
//////////////////////////////////////////////
// flash attention
using AttentionOp = FlashAttentionOp<T>;
using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_q_len * size_per_head_)};
Layout layout_k{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset),
.batch_seqs = key_cache_ptrs};
Layout layout_v{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset),
.batch_seqs = val_cache_ptrs};
Layout layout_o{
.stride_batch = int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(local_head_num_ * size_per_head_),
.stride_head = int(size_per_head_),
.use_seqlens = true,
};
AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
typename AttentionOp::Params attn_params{.attn_out = qkv_buf_3_,
.query = q_buf_2_,
.key = k_cache_buf_,
.val = v_cache_buf_,
.mask = attention_mask,
.out_accum = qk_buf_float_,
.cu_seqlens_q = cu_seqlens,
.cu_seqlens_k = nullptr,
.layout_q = layout_q,
.layout_k = layout_k,
.layout_v = layout_v,
.layout_o = layout_o};
//
flash_attention(attn_params, stream_);
}
template<typename T>
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len)
{
// key_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
// val_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
invokeTransposeKVCache(k_cache_buf_,
v_cache_buf_,
(const T**)key_cache_ptrs,
(const T**)val_cache_ptrs,
cache_layer_offset,
batch_size,
context_length, // history_len + input_len = context_len
max_k_len,
max_seq_len,
size_per_head_,
local_head_num_,
stream_);
sync_check_cuda_error();
const T qk_scale = static_cast<T>(1.f / sqrtf(size_per_head_ * 1.f));
//////////////////////////////////////////////
/// Q*K batch gemm
/// -> [B, H, s, t + s]
cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T,
CUBLAS_OP_N,
max_k_len, // m
max_q_len, // n
size_per_head_, // k
k_cache_buf_, // A
size_per_head_, // lda
max_k_len * size_per_head_, // strideA
q_buf_2_, // B
size_per_head_, // ldb
max_q_len * size_per_head_, // strideB
qk_buf_, // C
max_k_len, // ldc
max_q_len * max_k_len, // strideC
batch_size * local_head_num_); // batchCount
//////////////////////////////////////////////
/// ! masked softmax (kernel asserts k_length <= 4096)
MaskedSoftmaxParam<T, T> param{};
param.attention_score = qk_buf_;
param.qk = qk_buf_;
param.attention_mask = attention_mask;
param.batch_size = batch_size;
param.q_length = max_q_len;
param.k_length = max_k_len;
param.num_heads = local_head_num_;
param.qk_scale = qk_scale;
param.linear_bias_slopes = nullptr;
invokeMaskedSoftmax(param, stream_);
sync_check_cuda_error();
//////////////////////////////////////////////
/// softmax(QK)*V batch gemm
// -> [B, H, S, D]
cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N,
CUBLAS_OP_N,
size_per_head_, // m
max_q_len, // n
max_k_len, // k
v_cache_buf_, // A
size_per_head_, // lda
max_k_len * size_per_head_, // strideA,
qk_buf_, // B
max_k_len, // ldb
max_k_len * max_q_len, // strideB
qkv_buf_2_, // C
size_per_head_, // ldc,
max_q_len * size_per_head_, // strideC
batch_size * local_head_num_); // batchCount
//////////////////////////////////////////////
/// transpose <B,h,s,D> -> <B,s,h,D>
invokeTransposeAttentionOutRemovePadding(qkv_buf_2_,
qkv_buf_3_,
num_token,
batch_size,
max_q_len,
local_head_num_,
size_per_head_,
padding_offset,
nullptr,
0,
stream_);
sync_check_cuda_error();
}
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaContextAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
LlamaContextAttentionLayer(size_t head_num,
size_t size_per_head,
size_t rotary_embedding_dim,
bool neox_rotary_style,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
local_hidden_units_(hidden_units_ / tensor_para.world_size_),
rotary_embedding_dim_(rotary_embedding_dim),
neox_rotary_style_(neox_rotary_style),
tensor_para_(tensor_para),
stream_(stream),
cublas_wrapper_(cublas_wrapper),
linear_(cublas_wrapper, stream),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
use_fmha_(use_fmha)
{
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
void fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len);
private:
const size_t head_num_;
const size_t size_per_head_;
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_hidden_units_;
const size_t rotary_embedding_dim_;
const bool is_free_buffer_after_forward_;
const bool neox_rotary_style_;
const bool use_fmha_;
NcclParam tensor_para_;
cudaStream_t stream_;
IAllocator* allocator_;
cublasMMWrapper* cublas_wrapper_;
LlamaLinear<T> linear_;
T* qkv_buf_{};
T* q_buf_2_{};
T* k_buf_2_{};
T* v_buf_2_{};
T* k_cache_buf_{};
T* v_cache_buf_{};
T* qk_buf_{};
float* qk_buf_float_{};
T* qkv_buf_2_{};
T* qkv_buf_3_{};
bool is_allocate_buffer_ = false;
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/Tensor.h"
namespace fastertransformer {
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
attn_ffn_io_ = (T*)allocator_->reMalloc(attn_ffn_io_, sizeof(T) * num_token * hidden_units_, false);
attention_mask_ = (T*)allocator_->reMalloc(attention_mask_, sizeof(T) * batch_size * max_q_len * max_kv_len, false);
padding_offset_ = (int*)allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * max_q_len, false);
cu_seqlens_ = (int*)allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&attn_ffn_io_);
allocator_->free((void**)&padding_offset_);
allocator_->free((void**)&cu_seqlens_);
allocator_->free((void**)&attention_mask_);
allocator_->free((void**)&h_pinned_token_num_ptr_, true);
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaContextDecoder<T>::initialize(bool use_fmha)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
context_attention_layer_ = new LlamaContextAttentionLayer<T>(head_num_,
size_per_head_,
rotary_embedding_dim_,
false, // neox_rotary_style
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_fmha);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaContextDecoder<T>::forwardSelfAttn(const Session& sess,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final)
{
// FT_LOG_ERROR(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors{
{"input_query", Tensor{MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}},
{"attention_mask",
{MEMORY_GPU, data_type_, {sess.batch_size, 1, sess.max_query_len, sess.max_key_len}, attention_mask_}},
{"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &layer}},
{"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_final}},
{"padding_offset", {MEMORY_GPU, TYPE_INT32, {sess.token_num}, padding_offset_}},
{"cu_seqlens", {MEMORY_GPU, TYPE_INT32, {sess.batch_size + 1}, cu_seqlens_}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.input_length}},
{"history_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.history_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.context_length}},
{"max_seq_len", input_tensors->at("max_seq_len")}};
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"hidden_features", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
context_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rotary_embedding_dim_(rotary_embedding_dim),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
initialize(use_fmha);
}
template<typename T>
LlamaContextDecoder<T>::~LlamaContextDecoder()
{
delete context_attention_layer_;
delete silu_ffn_layer_;
freeBuffer();
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
/**
* input tensors:
* \param decoder_input [num_token, hidden_units], float
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
     * \param context_lengths [batch_size], int
* \param output_norm_weight [hidden_dims], float
* \param max_q_len [1], int on cpu
* \param max_kv_len [1], int on cpu
* \param max_seq_len [1], int on cpu
*
* output tensors:
* \param decoder_output [batch_size, seq_len, hidden_units],
* \param key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x]
* \param value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head]
* \param last_token_hidden_units [batch_size, hidden_units]
*/
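    // Hypothetical caller sketch (names and pointers are placeholders; shapes and placements
    // follow the doc block above and the lookups below):
    //   std::unordered_map<std::string, Tensor> inputs{
    //       {"decoder_input",      {MEMORY_GPU, dtype, {token_num, hidden_units}, d_input}},
    //       {"input_lengths",      {MEMORY_GPU, TYPE_INT32, {batch_size}, d_input_len}},
    //       {"history_lengths",    {MEMORY_GPU, TYPE_INT32, {batch_size}, d_history_len}},
    //       {"context_lengths",    {MEMORY_GPU, TYPE_INT32, {batch_size}, d_context_len}},
    //       {"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units}, d_norm_weight}},
    //       {"max_q_len",          {MEMORY_CPU, TYPE_INT32, {1}, &max_q_len}},
    //       {"max_kv_len",         {MEMORY_CPU, TYPE_INT32, {1}, &max_kv_len}},
    //       {"max_seq_len",        {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}}};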
Session sess{};
sess.token_num = input_tensors->at("decoder_input").shape[0];
sess.batch_size = input_tensors->at("input_lengths").shape[0];
sess.max_query_len = input_tensors->at("max_q_len").getVal<int>();
sess.max_key_len = input_tensors->at("max_kv_len").getVal<int>();
sess.weights = decoder_layer_weights;
sess.input_length = input_tensors->at("input_lengths").getPtr<int>();
sess.history_length = input_tensors->at("history_lengths").getPtr<int>();
sess.context_length = input_tensors->at("context_lengths").getPtr<int>();
T* decoder_input_output = input_tensors->at("decoder_input").getPtr<T>();
// T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
allocateBuffer(sess.batch_size, sess.token_num, sess.max_query_len, sess.max_key_len);
size_t tmp_token_num{};
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
padding_offset_,
cu_seqlens_,
input_tensors->at("input_lengths").getPtr<int>(),
sess.batch_size,
sess.max_query_len,
stream_);
sync_check_cuda_error();
FT_CHECK(tmp_token_num == sess.token_num);
invokeCreateCausalMasks(attention_mask_,
sess.input_length,
sess.context_length,
sess.max_query_len,
sess.max_key_len,
sess.batch_size,
stream_);
sync_check_cuda_error();
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(attn_ffn_io_,
decoder_input_output,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
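    // RMSNorm reference (LLaMA-style, for clarity): for each token vector x of width hidden_units_,
    //   y[i] = w[i] * x[i] / sqrt( mean_j(x[j]^2) + rmsnorm_eps_ )
    // attn_ffn_io_ now holds the normalized hidden states fed to the first self-attention layer.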
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(sess, input_tensors, layer, false);
invokeFusedAddResidualRMSNorm(decoder_input_output,
attn_ffn_io_,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
////////////////////////////////////////////
/// feed-forward network
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddResidualRMSNorm(decoder_input_output, //
attn_ffn_io_,
scale_weight,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
// #include "src/fastertransformer/kernels/add_residual_kernels.h"
// #include "src/fastertransformer/kernels/layernorm_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
// #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaContextDecoder: public BaseLayer {
protected:
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
void freeBuffer() override;
void initialize(bool use_fmha);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t rotary_embedding_dim_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
T* attn_ffn_io_{};
T* attention_mask_{};
int* padding_offset_{};
int* cu_seqlens_{}; // cu for cumulative
size_t* h_pinned_token_num_ptr_{};
LlamaContextAttentionLayer<T>* context_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
Tensor* k_cache;
Tensor* v_cache;
int* input_length{};
int* history_length{};
int* context_length{};
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final);
public:
LlamaContextDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha);
~LlamaContextDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
namespace fastertransformer {
template<typename T>
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
num_layer_(num_layer),
rotary_embedding_dim_(rotary_embedding_dim),
hidden_units_(head_num * size_per_head),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
template<typename T>
LlamaDecoder<T>::~LlamaDecoder()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
delete self_attention_layer_;
delete silu_ffn_layer_;
}
template<typename T>
void LlamaDecoder<T>::initialize()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
size_per_head_,
rotary_embedding_dim_,
false, // neox_rotary_style
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer(size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
{MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io});
const int layer_id = layer;
self_attention_input_tensors.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id});
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"attention_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
self_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
void LlamaDecoder<T>::forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer)
{
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &sess.weights->at(layer)->ffn_weights);
}
template<typename T>
void LlamaDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param decoder_input [batch_size, hidden_dims]
* \param sequence_lengths [batch_size] int
* \param output_norm_weight [hidden_dims]
* \param step [1] on cpu
* \param ite [1] on cpu
* \param finished [batch_size] bool
* \param total_padding_tokens [batch_size], int
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len] bool (optional), NOT USED YET
*
* output_tensors:
* \param decoder_output [batch_size, hidden_dimension]
* \param key_cache [batch_size] uint64_t
* \param value_cache [batch_size] uint64_t
*/
// for the shape of key cache, refer to decoder_masked_multihead_attention_template.hpp
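    // Per-layer structure of the loop below (pre-norm residual blocks; the fused kernels add the
    // residual and produce the next normalized input in one pass, with x = decoder_input and
    // h = decoder_output):
    //   h = RMSNorm(x) * w_attn_norm
    //   h = SelfAttn(h);  x += h;  h = RMSNorm(x) * w_ffn_norm
    //   h = FFN(h);       x += h;  h = RMSNorm(x) * w_next
    // where w_next is the next layer's attention norm weight, or output_norm_weight after the last layer.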
Session sess{};
sess.batch_size = input_tensors->at("decoder_input").shape[0];
sess.weights = decoder_layer_weights;
allocateBuffer(sess.batch_size);
sess.ite = input_tensors->at("ite").getVal<const int>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
sess.max_memory_len = input_tensors->at("max_seq_len").getVal<int>();
T* decoder_input = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
for (size_t layer = 0; layer < num_layer_; ++layer) {
// output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
forwardSelfAttn(sess, decoder_output, input_tensors, layer);
invokeFusedAddResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
// decoder_layer_output_ = ffn(decoder_normed_input_)
forwardFfn(sess, decoder_output, layer);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddResidualRMSNorm(decoder_input, //
decoder_output,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaDecoder<half>;
template class LlamaDecoder<float>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaDecoder: public BaseLayer {
protected:
void allocateBuffer() override; // deprecated
void allocateBuffer(size_t batch_size);
void freeBuffer() override;
void initialize();
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t rotary_embedding_dim_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
LlamaDecoderSelfAttentionLayer<T>* self_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
int ite;
size_t max_memory_len;
Tensor* k_cache;
Tensor* v_cache;
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer);
void forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer);
public:
LlamaDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
~LlamaDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<typename T>
LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(
size_t hidden_units, size_t inter_size, WeightType weight_type, size_t tensor_para_size, size_t tensor_para_rank):
hidden_units_(hidden_units),
inter_size_(inter_size),
weight_type_(weight_type),
tensor_para_size_(tensor_para_size),
tensor_para_rank_(tensor_para_rank)
{
self_attn_weights.qkv.input_dims = hidden_units_;
self_attn_weights.qkv.output_dims = 3 * hidden_units_ / tensor_para_size_;
self_attn_weights.qkv.type = weight_type;
self_attn_weights.output.input_dims = hidden_units_ / tensor_para_size_;
self_attn_weights.output.output_dims = hidden_units_;
self_attn_weights.output.type = weight_type;
ffn_weights.gating.input_dims = hidden_units_;
ffn_weights.gating.output_dims = inter_size_ / tensor_para_size_;
ffn_weights.gating.type = weight_type;
ffn_weights.intermediate.input_dims = hidden_units_;
ffn_weights.intermediate.output_dims = inter_size_ / tensor_para_size_;
ffn_weights.intermediate.type = weight_type;
ffn_weights.output.input_dims = inter_size_ / tensor_para_size_;
ffn_weights.output.output_dims = hidden_units_;
ffn_weights.output.type = weight_type;
mallocWeights();
}
template<typename T>
void freeWeights(LlamaDenseWeight<T>& weights)
{
cudaFree(weights.kernel);
cudaFree(weights.bias);
cudaFree(weights.scales);
cudaFree(weights.zeros);
weights.kernel = nullptr;
weights.bias = nullptr;
weights.scales = nullptr;
weights.zeros = nullptr;
}
template<typename T>
void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
{
if (bias) {
deviceMalloc((T**)&weights.bias, weights.output_dims);
}
const size_t bit_size = getBitSize(weights.type);
if (bit_size >= 16) { // fp16, fp32
deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims);
}
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
FT_CHECK(weights.input_dims % factor == 0);
deviceMalloc((float**)&weights.kernel, weights.input_dims / factor * weights.output_dims);
deviceMalloc((T**)&weights.scales, weights.output_dims);
deviceMalloc((T**)&weights.zeros, weights.output_dims);
}
}
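// Worked example of the packing above (illustrative): for WeightType::kINT4, bit_size = 4 and
// factor = 32 / 4 = 8, so a [input_dims, output_dims] kernel occupies
// input_dims / 8 * output_dims 32-bit words (allocated as float here), alongside
// output_dims per-output-channel scales and zero-points.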
template<typename T>
void loadWeights(LlamaDenseWeight<T>& w, std::string prefix, int rank, FtCudaDataType model_file_type)
{
prefix += "." + std::to_string(rank);
const auto type = model_file_type;
if (w.bias) {
loadWeightFromBin((T*)w.bias, {w.output_dims}, prefix + ".bias", type);
}
const size_t bit_size = getBitSize(w.type);
if (bit_size >= 16) { // fp16, fp32
loadWeightFromBin((T*)w.kernel, {w.input_dims, w.output_dims}, prefix + ".weight", type);
}
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
FT_CHECK(w.input_dims % factor == 0);
const auto f32_type = FtCudaDataType::FP32;
loadWeightFromBin((float*)w.kernel, {w.input_dims / factor, w.output_dims}, prefix + ".qweight", f32_type);
loadWeightFromBin((T*)w.scales, {w.output_dims}, prefix + ".scales", type);
loadWeightFromBin((T*)w.zeros, {w.output_dims}, prefix + ".zeros", type);
}
}
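// Hypothetical example of the resulting file names (prefix and rank are illustrative): for
// prefix "layers.0.attention.w_qkv" and rank 1, an fp16/fp32 checkpoint is read from
//   layers.0.attention.w_qkv.1.weight
// while a quantized checkpoint is read from the corresponding .qweight, .scales and .zeros files.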
template<typename T>
void LlamaDecoderLayerWeight<T>::mallocWeights()
{
deviceMalloc((T**)&self_attn_norm_weights, hidden_units_);
deviceMalloc((T**)&ffn_norm_weights, hidden_units_);
fastertransformer::mallocWeights(self_attn_weights.qkv, false);
fastertransformer::mallocWeights(self_attn_weights.output, false);
fastertransformer::mallocWeights(ffn_weights.gating, false);
fastertransformer::mallocWeights(ffn_weights.intermediate, false);
fastertransformer::mallocWeights(ffn_weights.output, false);
}
template<typename T>
LlamaDecoderLayerWeight<T>::~LlamaDecoderLayerWeight()
{
cudaFree((void*)self_attn_norm_weights);
cudaFree((void*)ffn_norm_weights);
freeWeights(self_attn_weights.qkv);
freeWeights(self_attn_weights.output);
freeWeights(ffn_weights.gating);
freeWeights(ffn_weights.intermediate);
freeWeights(ffn_weights.output);
}
template<typename T>
void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType model_file_type)
{
const auto type = model_file_type;
loadWeightFromBin(
(T*)self_attn_norm_weights, {hidden_units_}, dir_path + ".attention_norm.weight", model_file_type);
loadWeightFromBin((T*)ffn_norm_weights, {hidden_units_}, dir_path + ".ffn_norm.weight", model_file_type);
loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", tensor_para_rank_, type);
loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type);
loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type);
loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type);
loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type);
}
template struct LlamaDecoderLayerWeight<float>;
template struct LlamaDecoderLayerWeight<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
namespace fastertransformer {
template<typename T>
struct LlamaDecoderLayerWeight {
public:
LlamaDecoderLayerWeight() = delete;
LlamaDecoderLayerWeight(
size_t hidden_units, size_t inter_size, WeightType weight_type, size_t tensor_para_size, size_t tensor_para_rank);
~LlamaDecoderLayerWeight();
LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;
void loadModel(std::string dir_path, FtCudaDataType model_file_type);
T* self_attn_norm_weights{};
T* ffn_norm_weights{};
LlamaAttentionWeight<T> self_attn_weights{};
LlamaFfnWeight<T> ffn_weights{};
private:
size_t hidden_units_;
size_t inter_size_;
WeightType weight_type_;
size_t bit_size_;
size_t tensor_para_size_;
size_t tensor_para_rank_;
bool is_maintain_buffer_ = false;
void mallocWeights();
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include <string>
// #include <glog/logging.h>
namespace fastertransformer {
template<typename T>
struct SATypeConverter {
using Type = T;
};
template<>
struct SATypeConverter<half> {
using Type = uint16_t;
};
template<typename T>
static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
const T* qkv_bias,
const T* relative_attention_bias,
T* key_cache,
T* value_cache,
T** k_cache_per_sample,
T** v_cache_per_sample,
size_t kv_cache_per_sample_offset,
const int* cache_indir,
T* context_buf,
const bool* finished,
const int* sequence_lengths,
const int max_batch_size,
const int inference_batch_size,
const int beam_width,
const int head_num,
const int size_per_head,
const int rotary_embedding_dim,
const int memory_max_len,
const int* prefix_prompt_lengths,
const int max_prefix_prompt_length,
const int max_input_len,
const int* total_padding_tokens,
const int step,
const float q_scaling,
const int relative_attention_bias_stride,
const T* linear_bias_slopes,
const bool* masked_tokens,
const int* ia3_tasks,
const T* ia3_key_weights,
const T* ia3_value_weights,
const float* qkv_scale_out,
const float* attention_out_scale,
const int int8_mode,
cudaStream_t stream)
{
using DataType = typename SATypeConverter<T>::Type;
// Prepare the parameters.
Masked_multihead_attention_params<DataType> params;
memset(&params, 0, sizeof(params));
int hidden_units = head_num * size_per_head;
if (qkv_bias != nullptr) {
params.q_bias = reinterpret_cast<const DataType*>(qkv_bias);
params.k_bias = reinterpret_cast<const DataType*>(qkv_bias) + hidden_units;
params.v_bias = reinterpret_cast<const DataType*>(qkv_bias) + 2 * hidden_units;
}
else {
params.q_bias = nullptr;
params.k_bias = nullptr;
params.v_bias = nullptr;
}
// Set the output buffer.
params.out = reinterpret_cast<DataType*>(context_buf);
// Set the input buffers.
params.q = reinterpret_cast<const DataType*>(qkv_buf);
if (int8_mode != 2) {
params.k = reinterpret_cast<const DataType*>(qkv_buf) + hidden_units;
params.v = reinterpret_cast<const DataType*>(qkv_buf) + 2 * hidden_units;
}
else {
params.k = reinterpret_cast<const DataType*>(reinterpret_cast<const int8_t*>(qkv_buf) + hidden_units);
params.v = reinterpret_cast<const DataType*>(reinterpret_cast<const int8_t*>(qkv_buf) + 2 * hidden_units);
}
params.stride = 3 * hidden_units;
params.finished = const_cast<bool*>(finished);
params.k_cache = reinterpret_cast<DataType*>(key_cache);
params.v_cache = reinterpret_cast<DataType*>(value_cache);
params.k_cache_per_sample = reinterpret_cast<DataType**>(k_cache_per_sample);
params.v_cache_per_sample = reinterpret_cast<DataType**>(v_cache_per_sample);
params.kv_cache_per_sample_offset = kv_cache_per_sample_offset;
params.k_cache_interleaved = false;
params.cache_indir = cache_indir;
params.batch_size = inference_batch_size;
params.beam_width = beam_width;
params.memory_max_len = memory_max_len;
params.prefix_prompt_lengths = prefix_prompt_lengths;
params.max_prefix_prompt_length = max_prefix_prompt_length;
params.length_per_sample = sequence_lengths; // max_input_length + current output length
// timestep adding max_prefix_prompt_length for shared memory size calculation and rotary embedding computation
params.timestep = step + max_prefix_prompt_length - 1;
params.num_heads = head_num;
params.hidden_size_per_head = size_per_head;
params.rotary_embedding_dim = rotary_embedding_dim;
// Note: keep norm factor (sqrt(K_dim)) when adopting megatron T5 structure (may adjust)
params.inv_sqrt_dh = 1.F / (sqrtf((float)params.hidden_size_per_head) * q_scaling);
params.total_padding_tokens = total_padding_tokens;
if (relative_attention_bias != nullptr) {
params.relative_attention_bias = reinterpret_cast<const DataType*>(relative_attention_bias);
}
params.relative_attention_bias_stride = relative_attention_bias_stride;
params.masked_tokens = masked_tokens;
// The slope of linear position bias per head, e.g., ALiBi.
if (linear_bias_slopes != nullptr) {
params.linear_bias_slopes = reinterpret_cast<const DataType*>(linear_bias_slopes);
}
params.max_input_length = max_input_len;
params.ia3_tasks = ia3_tasks;
params.ia3_key_weights = reinterpret_cast<const DataType*>(ia3_key_weights);
params.ia3_value_weights = reinterpret_cast<const DataType*>(ia3_value_weights);
params.int8_mode = int8_mode;
if (int8_mode == 2) {
params.qkv_scale_out = qkv_scale_out;
params.attention_out_scale = attention_out_scale;
}
PUSH_RANGE("scaled dot-product fusion");
masked_multihead_attention(params, stream);
POP_RANGE;
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::allocateBuffer(size_t batch_size, int key_len, int max_memory_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
qkv_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(qkv_buf_, sizeof(T) * batch_size * 3 * local_hidden_units_, false));
context_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(context_buf_, sizeof(T) * batch_size * local_hidden_units_, false));
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&context_buf_));
// allocator_->free((void**)(&k_cache_buf_));
// allocator_->free((void**)(&v_cache_buf_));
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
/**
* input tensors:
* \param input_query [batch_size, hidden_units],
* \param sequence_lengths [batch_size]
* \param step [1] on cpu
* \param finished [batch_size]
* \param total_padding_tokens [batch_size]
* \param layer_id [1], int on cpu
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len], (optional), NOT USED YET
* \param cache_indirection [batch_size / beam_width, beam_width, memory_max_len] (optional)
*
* output tensors:
* \param attention_output [batch_size, hidden_units],
* \param key_cache [batch, local_head_num, size_per_head / x, memory_max_len, x]
* \param value_cache [batch, local_head_num, memory_max_len, size_per_head]
*/
const T* input_query_data = input_tensors->getPtr<T>("input_query");
const int* sequence_lengths_data = input_tensors->getPtr<int>("sequence_lengths");
const int* total_padding_len = input_tensors->getPtr<int>("total_padding_tokens");
const bool* finished_data = input_tensors->getPtr<bool>("finished", nullptr);
const bool* masked_tokens_data = input_tensors->getPtr<bool>("masked_tokens", nullptr);
const int* cache_indir = input_tensors->getPtr<int>("cache_indirection", nullptr);
T* hidden_features_data = output_tensors->getPtr<T>("attention_output");
T** key_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
T** value_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
const int layer_id = input_tensors->getVal<int>("layer_id");
const int max_seq_len = input_tensors->getVal<int>("max_seq_len");
const int step = input_tensors->getVal<int>("step");
const int step_1 = step - 1;
const int batch_size = input_tensors->at("input_query").shape[0];
const int beam_width = cache_indir != nullptr ? input_tensors->at("cache_indirection").shape[1] : 1;
allocateBuffer(batch_size, step, max_seq_len);
PUSH_RANGE("qkv_gemm");
linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
POP_RANGE;
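    // QKV layout assumed by the dispatch below (implied by params.stride and the k/v offsets in
    // fusedQKV_masked_attention_dispatch): each token's row of qkv_buf_ is [q | k | v], each
    // segment local_hidden_units_ wide, i.e. a stride of 3 * local_hidden_units_ per token;
    // the attention result for each sequence lands in context_buf_ (local_hidden_units_ wide).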
const auto kv_cache_layer_offset = layer_id * local_head_num_ * max_seq_len * size_per_head_;
const int memory_len = max_seq_len;
fusedQKV_masked_attention_dispatch<T>(
qkv_buf_,
nullptr, // query_weight.bias,
nullptr, // relative_attention_bias,
nullptr,
nullptr,
key_cache_ptrs,
value_cache_ptrs,
kv_cache_layer_offset,
cache_indir,
context_buf_,
finished_data,
sequence_lengths_data, // NOTE: current seq len including padding (fixed after meeting the finished id)
batch_size,
batch_size,
beam_width,
local_head_num_,
size_per_head_,
rotary_embedding_dim_,
memory_len,
nullptr, // prefix_prompt_lengths
0, // max_prefix_prompt_length
0, // max_input_length, not used w/o linear_bias_slopes
input_tensors->getPtr<int>("total_padding_tokens", nullptr),
step,
1.f, // q_scaling
0, // relative_attention_bias_stride
nullptr, // linear_bias_slopes
nullptr, // masked_tokens_data,
nullptr, // ia3_tasks
nullptr, // ia3_key_weights
nullptr, // ia3_value_weights
nullptr, // qkv_scale_out
nullptr, // attention_out_scale
0, // int8_mode
stream_);
sync_check_cuda_error();
linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(
hidden_features_data, hidden_features_data, batch_size * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
// LOG(WARNING);
}
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaDecoderSelfAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size, int key_len, int max_memory_len);
LlamaDecoderSelfAttentionLayer(size_t head_num,
size_t size_per_head,
size_t rotary_embedding_dim,
bool neox_rotary_style,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
local_hidden_units_(hidden_units_ / tensor_para.world_size_),
rotary_embedding_dim_(rotary_embedding_dim),
neox_rotary_style_(neox_rotary_style),
tensor_para_(tensor_para),
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward)
{
}
~LlamaDecoderSelfAttentionLayer()
{
freeBuffer();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
private:
const size_t head_num_;
const size_t size_per_head_;
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_hidden_units_;
const size_t rotary_embedding_dim_;
const bool is_free_buffer_after_forward_;
const bool neox_rotary_style_;
NcclParam tensor_para_;
cudaStream_t stream_;
IAllocator* allocator_;
LlamaLinear<T> linear_;
T* qkv_buf_ = nullptr;
T* context_buf_ = nullptr;
// T* weight_buf_ = nullptr;
// T* k_cache_buf_{};
// T* v_cache_buf_{};
// T* tmp_k_cache_buf_{};
// T* tmp_v_cache_buf_{};
// T* tmp_cache_buf_{};
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
#pragma once
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
enum class WeightType : int
{
kFP32,
kFP16,
kFP8, // not supported yet
kINT8,
kINT4
};
inline size_t getBitSize(WeightType type)
{
    switch (type) {
        case WeightType::kFP32:
            return 32;
        case WeightType::kFP16:
            return 16;
        case WeightType::kFP8:
            return 8;
        case WeightType::kINT8:
            return 8;
        case WeightType::kINT4:
            return 4;
    }
    return 0;  // unreachable for valid WeightType values; avoids falling off a non-void function
}
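// Usage sketch (illustrative): the weight loaders and allocators branch on this value, e.g.
//   if (getBitSize(w.type) >= 16) { /* dense T kernel */ } else { /* packed quantized kernel */ }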
template<typename T>
struct LlamaDenseWeight {
    // default-initialize so freeWeights()/loadWeights() never see indeterminate pointers
    size_t     input_dims  = 0;
    size_t     output_dims = 0;
    void*      kernel      = nullptr;
    WeightType type        = WeightType::kFP32;
    T*         bias        = nullptr;
    T*         scales      = nullptr;
    T*         zeros       = nullptr;
};
template<typename T>
struct LlamaAttentionWeight {
LlamaDenseWeight<T> qkv;
LlamaDenseWeight<T> output;
};
template<typename T>
struct LlamaFfnWeight {
LlamaDenseWeight<T> gating;
LlamaDenseWeight<T> intermediate;
LlamaDenseWeight<T> output;
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
// #include <glog/logging.h>
namespace fastertransformer {
template<typename T>
void LlamaFfnLayer<T>::allocateBuffer(size_t token_num)
{
inter_buf_ = (T*)allocator_->reMalloc(inter_buf_, sizeof(T) * token_num * inter_size_, false);
gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * token_num * inter_size_, false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaFfnLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)&inter_buf_);
allocator_->free((void**)&gating_buf_);
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaFfnLayer<T>::activation(int num_token)
{
invokeGenericActivation<SiluActivation>(gating_buf_,
(const T*)nullptr, // bias
inter_buf_,
(const T*)nullptr, // gated_bias
nullptr, // ia3_tasks
(const T*)nullptr, // ia3_weights
num_token, // m
inter_size_, // n
0, // int8_mode
nullptr, // activation_in
nullptr, // activation_out
nullptr, // padding_offset
0, // seq_len
stream_);
sync_check_cuda_error();
}
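// Gated-SiLU combination computed above (as implied by how gating_buf_ feeds the output
// projection in forward()):
//   gating_buf_[i] = silu(gating_buf_[i]) * inter_buf_[i],   silu(x) = x * sigmoid(x)
// so the whole FFN is  W_out( silu(W_gating * x) * (W_inter * x) )  with an elementwise product.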
template<typename T>
void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaFfnWeight<T>* weights)
{
/**
* input_tensors:
* \param ffn_input [token_num, hidden_dimension]
*
* output_tensors:
* \param ffn_output [token_num, hidden_dimension]
*/
const size_t num_token = input_tensors->at("ffn_input").shape[0];
// LOG(WARNING);
allocateBuffer(num_token);
const T* ffn_input_data = input_tensors->at("ffn_input").getPtr<T>();
T* ffn_output_data = output_tensors->at("ffn_output").getPtr<T>();
PUSH_RANGE("ffn");
// TODO: fuse the two GEMMs with activation
linear_.forward(gating_buf_, ffn_input_data, num_token, weights->gating);
linear_.forward(inter_buf_, ffn_input_data, num_token, weights->intermediate);
activation(num_token);
linear_.forward(ffn_output_data, gating_buf_, num_token, weights->output);
POP_RANGE;
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
// LOG(WARNING);
}
template class LlamaFfnLayer<float>;
template class LlamaFfnLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
#pragma once
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include <functional>
namespace fastertransformer {
template<typename T>
class LlamaFfnLayer {
public:
LlamaFfnLayer(size_t head_num,
size_t size_per_head,
size_t inter_size,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size / tensor_para.world_size_),
hidden_units_(head_num * size_per_head),
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
tensor_para_(tensor_para),
is_free_buffer_after_forward_(is_free_buffer_after_forward)
{
}
~LlamaFfnLayer()
{
freeBuffer();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight<T>* weights);
private:
void allocateBuffer(size_t token_num);
void freeBuffer();
void activation(int num_token);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t hidden_units_;
cudaStream_t stream_;
LlamaLinear<T> linear_;
IAllocator* allocator_;
bool is_free_buffer_after_forward_;
T* gating_buf_{};
T* inter_buf_{};
NcclParam tensor_para_;
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/utils/instance_comm.h"
namespace fastertransformer {
class LlamaInstanceComm: public AbstractInstanceComm {
public:
LlamaInstanceComm(int count): barrier_(count) {}
void barrier() override
{
barrier_.wait();
}
void setSharedObject(void* p) override
{
ptr = p;
}
void* getSharedObject() override
{
return ptr;
}
private:
Barrier barrier_;
void* ptr{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaLinear {
public:
LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): cublas_wrapper_(cublas_wrapper), stream_(stream)
{
}
void forward(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight)
{
switch (weight.type) {
case WeightType::kFP16:
case WeightType::kFP32:
forwardFp(output_data, input_data, batch_size, weight);
break;
case WeightType::kINT4:
forwardInt4(output_data, input_data, batch_size, weight);
break;
default:
FT_CHECK(0);
}
}
private:
void forwardFp(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight)
{
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
weight.output_dims,
batch_size,
weight.input_dims,
(const T*)weight.kernel,
weight.output_dims,
input_data,
weight.input_dims,
output_data,
weight.output_dims);
sync_check_cuda_error();
}
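    // Row-major view of the column-major cuBLAS call above (illustrative):
    //   output_data[b][o] = sum_i input_data[b][i] * kernel[i][o]
    // with kernel stored as [input_dims, output_dims], i.e. y = x * W with no transpose.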
void forwardInt4(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight)
{
FT_CHECK_WITH_INFO(0, "Not implemented");
}
private:
cublasMMWrapper* cublas_wrapper_;
cudaStream_t stream_{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/nccl_utils.h"
#include <array>
#include <atomic>
#include <condition_variable>
#include <cuda_runtime.h>
#include <mutex>
namespace fastertransformer {
struct NcclGuard {
static constexpr int kMaxGroupCount = 32;
static std::mutex& globalNcclMutex()
{
static std::mutex inst;
return inst;
}
struct GroupState {
std::mutex mutex;
std::condition_variable cv;
int ref_count;
};
static GroupState& groupState(int group_id)
{
static std::array<GroupState, kMaxGroupCount> array{};
FT_CHECK(group_id < kMaxGroupCount);
return array[group_id];
}
NcclGuard(NcclParam tensor_para, cudaStream_t stream, bool barrier = false):
tensor_para_(tensor_para), stream_(stream), barrier_(barrier)
{
if (is_active()) {
auto& group = groupState(tensor_para_.group_id_);
if (tensor_para_.rank_ == 0) {
/// TODO: use std::optional after switching to C++17
global_nccl_lock_ = std::make_unique<std::lock_guard<std::mutex>>(globalNcclMutex());
{
std::lock_guard<std::mutex> lock(group.mutex);
group.ref_count = tensor_para_.world_size_;
}
group.cv.notify_all();
}
else {
std::unique_lock<std::mutex> lock(group.mutex);
group.cv.wait(lock, [&] { return group.ref_count > 0; });
}
}
}
~NcclGuard()
{
if (is_active()) {
ftNcclStreamSynchronize(tensor_para_, NcclParam{}, stream_);
auto& group = groupState(tensor_para_.group_id_);
int value = -1;
{
std::lock_guard<std::mutex> lock(group.mutex);
value = --group.ref_count;
}
if (value == 0) {
group.cv.notify_all();
}
else if (barrier_ || tensor_para_.rank_ == 0) {
std::unique_lock<std::mutex> lock(group.mutex);
group.cv.wait(lock, [&] { return group.ref_count == 0; });
}
// rank 0 unlocks global NCCL mutex automatically
}
}
bool is_active()
{
return barrier_ || (ftNcclGroupCount() > 1 && tensor_para_.world_size_ > 1);
}
NcclParam tensor_para_;
cudaStream_t stream_;
bool barrier_;
std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
};
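// Typical usage (see LlamaDecoderSelfAttentionLayer / LlamaFfnLayer in this check-in):
//   {
//       NcclGuard nccl_guard(tensor_para_, stream_);
//       ftNcclAllReduceSum(buf, buf, count, tensor_para_, stream_);
//   }  // destructor syncs the stream; rank 0 releases the global NCCL lock on scope exit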
} // namespace fastertransformer