Commit 9efcac38 (parent 720fc533) authored by Li Zhang

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <functional>
#include <memory>
#include <numeric>
#include <sstream>
#include <stdexcept>
namespace fastertransformer {
template<typename T>
LlamaV2<T>::LlamaV2(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t vocab_size,
size_t rotary_embedding_dim,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
int start_id,
int end_id,
int cache_max_entry_count,
int cache_chunk_size,
bool use_context_fmha,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
num_layer_(num_layer),
vocab_size_(vocab_size),
rotary_embedding_dim_(rotary_embedding_dim),
rmsnorm_eps_(norm_eps),
start_id_(start_id),
end_id_(end_id),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
weights_(weights),
tensor_para_(tensor_para),
stream_(stream),
cublas_wrapper_(cublas_wrapper),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
cuda_device_prop_(cuda_device_prop),
debug_(isDebug()),
step_length_(step_length),
batch_(max_batch_size, max_context_token_num, session_len, this),
shared_state_(shared_state)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(vocab_size_ % tensor_para_.world_size_ == 0);
FT_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
kv_cache_mgr_ = std::make_unique<LlamaCacheManager>(num_layer_,
local_head_num_,
size_per_head_,
session_len,
sizeof(T) * 8,
cache_max_entry_count,
cache_chunk_size,
tensor_para.rank_,
allocator);
initialize(use_context_fmha);
start();
}
template<typename T>
LlamaV2<T>::~LlamaV2()
{
internal_thread_.join();
delete decoder_;
delete dynamic_decode_layer_;
delete context_decoder_;
}
template<typename T>
void LlamaV2<T>::initialize(bool use_context_fmha)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
context_decoder_ = new LlamaContextDecoder<T>(head_num_,
size_per_head_,
inter_size_,
num_layer_,
rotary_embedding_dim_,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_context_fmha);
decoder_ = new LlamaDecoder<T>(head_num_,
size_per_head_,
inter_size_,
num_layer_,
rotary_embedding_dim_,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
dynamic_decode_layer_ = new DynamicDecodeLayer<float>(vocab_size_,
vocab_size_, // vocab_size_padded,
0, // end_id, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cuda_device_prop_);
}
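// Looks up the embeddings of the tokens at position `step` in `token_ids_buf` for every
// sequence in the batch (one token per sequence); used only during incremental decoding.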
template<typename T>
void LlamaV2<T>::embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// ! This kernel can't be used in context decoding
invokeEmbeddingLookupPosEncodingPadCount(embeddings,
weights_->pre_decoder_embedding_table,
static_cast<T*>(nullptr), // position encoding
token_ids_buf,
static_cast<int*>(nullptr), // padding count, not used w/o pos-code
batch_size,
hidden_units_,
static_cast<T>(1.), // scale
step, // step, used to index into output_ids_buf_
batch_size, // token_num
0, // ite
stream_);
sync_check_cuda_error();
}
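// Prompt ("context") phase: embeds all input tokens, runs the context decoder over the
// whole prompt to fill the per-sequence K/V caches, and writes the hidden state of each
// sequence's last token to `decoder_output` for the first sampling step.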
template<typename T>
void LlamaV2<T>::contextDecode(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* history_length,
const int* context_length,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding start");
}
invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf,
nullptr, // processed somewhere else
weights_->pre_decoder_embedding_table,
static_cast<T*>(nullptr),
pPromptTuningParam<T>{},
input_ids,
0, // only used for position encoding
token_num,
token_num,
1,
hidden_units_,
stream_);
sync_check_cuda_error();
const auto dtype = getTensorType<T>();
const auto bsz = batch_size;
const int max_q_len = max_input_len;
const int max_kv_len = max_context_len;
const int max_seq_len = session_len;
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, context_decoder_input_buf}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, input_length}},
{"history_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, history_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, context_length}},
{"max_q_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_q_len}},
{"max_kv_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_kv_len}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
};
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {bsz, max_input_len, hidden_units_}, context_decoder_output_buf}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, v_cache_ptr}},
{"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, deocder_output}}};
context_decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding end");
}
}
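// Generation phase: runs a single decoder step for the whole batch, reading and
// appending to the K/V caches addressed by `k_cache_ptr` / `v_cache_ptr`.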
template<typename T>
void LlamaV2<T>::decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const int* total_padding_count,
bool* finished,
int step,
int ite,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
const int max_seq_len = session_len;
const auto dtype = getTensorType<T>();
// max_input_length is not used w/o linear_bias_slopes
// sequence_lengths_ will be incremented in dynamic decode
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_input}},
{"sequence_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}},
{"total_padding_tokens", {MEMORY_GPU, TYPE_INT32, {batch_size}, total_padding_count}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"ite", {MEMORY_CPU, TYPE_INT32, {1}, &ite}},
};
// LOG(ERROR) << key_cache_ << " " << value_cache_;
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_output}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, v_cache_ptr}},
};
decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
}
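// Projects decoder hidden states onto the vocabulary to produce fp32 logits. With
// tensor parallelism, each rank computes its vocabulary shard, the shards are
// all-gathered, and the result is transposed into [batch, vocab] order.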
template<typename T>
void LlamaV2<T>::postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t data_type = getCudaDataType<T>();
float alpha = 1.f;
float beta = 0.f;
if (tensor_para_.world_size_ == 1) {
cublas_wrapper_->Gemm(CUBLAS_OP_T,
CUBLAS_OP_N,
vocab_size_, // n
batch_size,
hidden_units_, // k
&alpha,
weights_->post_decoder_embedding_kernel,
data_type,
hidden_units_, // k
decoder_output,
data_type,
hidden_units_, // k
&beta,
logits,
CUDA_R_32F,
vocab_size_, // n
CUDA_R_32F,
cublasGemmAlgo_t(-1));
}
else {
FT_CHECK(vocab_size_ % tensor_para_.world_size_ == 0);
const size_t local_vocab_size = vocab_size_ / tensor_para_.world_size_;
cublas_wrapper_->Gemm(CUBLAS_OP_T,
CUBLAS_OP_N,
local_vocab_size, // n
batch_size,
hidden_units_, // k
&alpha,
weights_->post_decoder_embedding_kernel
+ tensor_para_.rank_ * local_vocab_size * hidden_units_,
data_type,
hidden_units_, // k
decoder_output,
data_type,
hidden_units_, // k
&beta,
local_logits + tensor_para_.rank_ * batch_size * local_vocab_size,
CUDA_R_32F,
local_vocab_size, // n
CUDA_R_32F,
cublasGemmAlgo_t(-1));
{
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllGather(local_logits, // send_buf
local_logits, // recv_buf
batch_size * local_vocab_size, // data_size
tensor_para_.rank_,
tensor_para_,
stream_);
}
invokeTransposeAxis01(logits, local_logits, tensor_para_.world_size_, batch_size, local_vocab_size, stream_);
sync_check_cuda_error();
}
}
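// Runs sampling and stopping criteria for one step via DynamicDecodeLayer; optional
// sampling parameters (top-k/top-p, temperature, ...) are forwarded when present.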
template<typename T>
void LlamaV2<T>::dynamicDecode(int* token_ids,
bool* finished,
int* sequence_length,
bool* should_stop,
TensorMap* inputs,
TensorMap* outputs,
const float* logits,
const uint32_t* seq_limit_len,
const int* context_length,
const int* end_ids,
int step,
int ite,
size_t max_context_len,
size_t token_ids_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
int local_batch_size = (int)batch_size;
std::unordered_map<std::string, Tensor> dynamic_decode_input_tensors{
{"logits", {MEMORY_GPU, TYPE_FP32, {batch_size, (size_t)1, vocab_size_}, logits}},
{"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", {MEMORY_CPU, TYPE_INT32, {1}, &max_context_len}},
{"sequence_limit_length", {MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size, 1}, context_length}},
{"ite", {MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"end_id", {MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids}},
{"local_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}},
};
const std::vector<std::string> optional_inputs{"stop_words_list",
"bad_words_list",
"runtime_top_k",
"runtime_top_p",
"temperature",
"repetition_penalty",
"random_seed"};
for (const auto& key : optional_inputs) {
if (inputs->isExist(key)) {
dynamic_decode_input_tensors.insert({key, inputs->at(key)});
}
}
std::unordered_map<std::string, Tensor> dynamic_decode_output_tensors{
{"output_ids", {MEMORY_GPU, TYPE_INT32, {token_ids_len, batch_size, 1U}, token_ids}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}},
{"sequence_length", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}},
{"should_stop", {MEMORY_CPU, TYPE_BOOL, {1}, should_stop}}};
const std::vector<std::string> optional_outputs{"cum_log_probs", "output_log_probs"};
for (const auto& key : optional_outputs) {
if (outputs->isExist(key)) {
dynamic_decode_output_tensors.insert({key, outputs->at(key)});
}
}
dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
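// Main loop of the decoupled batching thread: rank 0 dequeues new stop/infer requests,
// all ranks synchronize on the shared barrier, newly added requests go through context
// decoding, then up to `step_length_` generation steps are executed before the loop
// checks the queue again.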
template<typename T>
void LlamaV2<T>::internalThreadEntry(int device_id)
{
FT_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
check_cuda_error(cudaSetDevice(device_id));
auto& request_queue = shared_state_->request_queue;
auto& infer_requests = shared_state_->infer_requests;
auto& stop_requests = shared_state_->stop_requests;
while (1) {
if (tensor_para_.rank_ == 0) {
const int free_slot_count = batch_.maxSize() - batch_.size() + batch_.finishedCount();
const bool is_empty = free_slot_count == batch_.maxSize();
request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty);
batch_.verifyRequests(stop_requests, infer_requests);
}
// wait while rank-0 is dequeueing
shared_state_->barrier->wait();
bool modified = false;
if (!(batch_.finishedCount() == 0 && stop_requests.empty() && infer_requests.empty())) {
batch_.handleStopRequests(stop_requests);
batch_.synchronize();
modified = true;
}
const int infer_request_count = infer_requests.size();
if (!infer_requests.empty()) {
batch_.initialize(infer_requests); // reinitialize when new requests come, possible buffer allocation
batch_.contextDecode();
modified = true;
}
// wait while shared stop/infer_requests is being used
shared_state_->barrier->wait();
if (batch_.size()) {
if (modified) {
batch_.initializeGeneration();
batch_.initializeSampling(infer_request_count);
}
for (int i = 0; i < step_length_; ++i) {
if (!batch_.generate()) {
break;
}
}
batch_.finish();
}
}
FT_CHECK(0);
}
template<typename T>
void LlamaV2<T>::start()
{
int device_id = -1;
check_cuda_error(cudaGetDevice(&device_id));
internal_thread_ = std::thread(&LlamaV2<T>::internalThreadEntry, this, device_id);
}
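// Returns a view of the `index`-th slice along dim 0; tensors whose first dimension is 1
// are shared across the batch and returned unchanged.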
static inline Tensor slice(const Tensor& tensor, int index)
{
auto shape = tensor.shape;
if (shape.at(0) == 1) {
return tensor;
}
shape[0] = 1;
const auto offset = std::accumulate(shape.begin(), shape.end(), (size_t)index, std::multiplies<>{});
return tensor.slice(shape, offset);
}
// ! implicit conversion from `unordered_map` to `TensorMap` drops 0-sized tensors
static inline TensorMap slice(const std::unordered_map<std::string, Tensor>& src, int index)
{
TensorMap dst;
for (const auto& kv : src) {
dst.insert({kv.first, slice(kv.second, index)});
}
return dst;
}
template<typename T>
void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
const std::unordered_map<std::string, Tensor>* inputs,
Control control)
{
if (debug_) {
if (tensor_para_.rank_ == 0) {
for (const auto& kv : *inputs) {
FT_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
for (const auto& kv : *outputs) {
FT_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
}
}
const int batch_size = outputs->at("output_ids").shape[0];
const auto rank = tensor_para_.rank_;
std::vector<std::shared_ptr<Request>> requests(batch_size);
// rank-0 allocates all requests for the batch
if (rank == 0) {
for (int i = 0; i < batch_size; ++i) {
requests[i] = std::make_shared<Request>();
requests[i]->inputs.resize(tensor_para_.world_size_);
requests[i]->outputs.resize(tensor_para_.world_size_);
}
control.comm->setSharedObject(&requests);
}
control.comm->barrier();
if (rank != 0) {
requests = *(std::vector<std::shared_ptr<Request>>*)control.comm->getSharedObject();
}
for (int i = 0; i < batch_size; ++i) {
auto& r = requests[i];
r->inputs[rank] = slice(*inputs, i);
r->outputs[rank] = slice(*outputs, i);
if (rank == 0) {
r->id = r->inputs[rank].getVal<uint64_t>("CORRID", i);
r->start_flag = r->inputs[rank].getVal<int>("START", 1);
r->end_flag = r->inputs[rank].getVal<int>("END", 1);
r->stop_flag = r->inputs[rank].getVal<int>("STOP", 0);
r->stream_cb = control.callback;
}
}
control.comm->barrier();
// rank-0 now takes ownership of `requests`
// rank-0 submits the tasks and waits for them to finish
std::vector<int> error_codes;
bool has_error = 0;
if (rank == 0) {
FT_LOG_INFO("[forward] Enqueue requests");
auto futures = shared_state_->request_queue.enqueue(std::move(requests));
FT_LOG_INFO("[forward] Wait for requests to complete ...");
for (auto& f : futures) {
auto ec = f.get();
error_codes.push_back(ec);
if (ec) {
has_error = true;
}
}
}
// prevent request tensors from being freed before the batch completes
control.comm->barrier();
if (rank == 0 && has_error) {
std::stringstream ss;
for (size_t i = 0; i < error_codes.size(); ++i) {
ss << (i ? " " : "") << error_codes[i];
}
throw std::runtime_error(ss.str());
}
}
template class LlamaV2<half>;
template class LlamaV2<float>;
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
#pragma once
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include <memory>
#include <thread>
#include <unordered_map>
namespace fastertransformer {
template<typename T>
class LlamaV2 {
public:
struct SharedState {
std::vector<std::shared_ptr<Request>> infer_requests;
std::vector<std::shared_ptr<Request>> stop_requests;
RequestQueue request_queue;
std::shared_ptr<Barrier> barrier;
};
~LlamaV2();
LlamaV2(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t vocab_size,
size_t rotary_embedding_dim,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
int start_id,
int end_id,
int cache_max_entry_count,
int cache_chunk_size,
bool use_context_fmha,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
struct Control {
AbstractInstanceComm* comm;
Request::Callback callback;
};
void forward(std::unordered_map<std::string, Tensor>* outputs,
const std::unordered_map<std::string, Tensor>* inputs,
Control control);
void stop(const std::vector<uint64_t>& seq_ids);
private:
friend class Batch;
void internalThreadEntry(int device_id);
void initialize(bool use_context_fmha);
void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step);
void contextDecode(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* history_length,
const int* context_length,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const int* total_padding_count,
bool* finished,
int step,
int ite,
size_t session_len,
size_t batch_size);
void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size);
void dynamicDecode(int* token_ids,
bool* finished,
int* sequence_length,
bool* should_stop,
TensorMap* inputs,
TensorMap* outputs,
const float* logits,
const uint32_t* seq_limit_len,
const int* context_length,
const int* end_ids,
int step,
int ite,
size_t max_context_len,
size_t token_ids_len,
size_t batch_size);
void start();
private:
friend class LlamaBatch<T>;
const size_t head_num_;
const size_t size_per_head_;
const size_t inter_size_;
const size_t num_layer_;
const size_t vocab_size_;
const size_t rotary_embedding_dim_;
float rmsnorm_eps_ = 1e-6f;
static constexpr bool neox_rotary_style_ = false;
const int start_id_;
const int end_id_;
const size_t hidden_units_;
const size_t local_head_num_;
NcclParam tensor_para_;
cudaStream_t stream_;
cublasMMWrapper* cublas_wrapper_;
IAllocator* allocator_;
bool is_free_buffer_after_forward_;
cudaDeviceProp* cuda_device_prop_;
const bool debug_{false};
std::unique_ptr<LlamaCacheManager> kv_cache_mgr_;
LlamaWeight<T>* weights_{};
LlamaDecoder<T>* decoder_{};
LlamaContextDecoder<T>* context_decoder_{};
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
const int step_length_;
LlamaBatch<T> batch_;
std::shared_ptr<SharedState> shared_state_;
std::thread internal_thread_;
};
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc
#include "src/fastertransformer/models/llama/LlamaWeight.h"
namespace fastertransformer {
template<typename T>
LlamaWeight<T>::LlamaWeight(size_t hidden_units,
size_t inter_size,
size_t vocab_size,
size_t num_layer,
WeightType weight_type,
size_t tensor_para_size,
size_t tensor_para_rank,
int prefix_cache_len):
hidden_units_(hidden_units),
inter_size_(inter_size),
vocab_size_(vocab_size),
num_layer_(num_layer),
weight_type_(weight_type),
tensor_para_size_(tensor_para_size),
tensor_para_rank_(tensor_para_rank),
prefix_cache_len_(prefix_cache_len)
{
decoder_layer_weights.reserve(num_layer_);
for (unsigned l = 0; l < num_layer_; ++l) {
decoder_layer_weights.push_back(new LlamaDecoderLayerWeight<T>(
hidden_units_, inter_size_, weight_type_, tensor_para_size_, tensor_para_rank_));
}
mallocWeights();
}
template<typename T>
LlamaWeight<T>::~LlamaWeight()
{
cudaFree((void*)pre_decoder_embedding_table);
cudaFree((void*)output_norm_weight);
cudaFree((void*)post_decoder_embedding_kernel);
if (prefix_cache_key) {
cudaFree((void*)prefix_cache_key);
cudaFree((void*)prefix_cache_token);
}
pre_decoder_embedding_table = nullptr;
post_decoder_embedding_kernel = nullptr;
prefix_cache_token = nullptr;
prefix_cache_key = nullptr;
prefix_cache_value = nullptr;
}
template<typename T>
void LlamaWeight<T>::mallocWeights()
{
deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_ * hidden_units_);
deviceMalloc((T**)&output_norm_weight, hidden_units_);
deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_);
if (prefix_cache_len_) {
size_t cache_size = num_layer_ * prefix_cache_len_ * hidden_units_ / tensor_para_size_;
deviceMalloc((T**)&prefix_cache_key, cache_size * 2);
prefix_cache_value = prefix_cache_key + cache_size;
deviceMalloc((int**)&prefix_cache_token, prefix_cache_len_);
}
}
template<typename T>
void LlamaWeight<T>::loadModel(std::string dir_path)
{
FtCudaDataType model_file_type = FtCudaDataType::FP16;
dir_path += '/';
loadWeightFromBin((T*)pre_decoder_embedding_table,
{vocab_size_ * hidden_units_},
dir_path + "tok_embeddings.weight",
model_file_type);
loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);
loadWeightFromBin(
(T*)post_decoder_embedding_kernel, {hidden_units_ * vocab_size_}, dir_path + "output.weight", model_file_type);
if (prefix_cache_len_) {
loadWeightFromBin((float*)prefix_cache_token, {prefix_cache_len_}, dir_path + "prefix_cache.token");
loadWeightFromBin((T*)prefix_cache_key,
{num_layer_ * prefix_cache_len_, hidden_units_ / tensor_para_size_},
dir_path + "prefix_cache." + std::to_string(tensor_para_rank_) + ".key",
model_file_type);
loadWeightFromBin((T*)prefix_cache_value,
{num_layer_ * prefix_cache_len_, hidden_units_ / tensor_para_size_},
dir_path + "prefix_cache." + std::to_string(tensor_para_rank_) + ".value",
model_file_type);
}
for (unsigned layer = 0; layer < num_layer_; ++layer) {
decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
}
}
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<typename T>
struct LlamaWeight {
LlamaWeight() = default;
LlamaWeight(size_t hidden_units,
size_t inter_size,
size_t vocab_size,
size_t num_layer,
WeightType weight_type,
size_t tensor_para_size,
size_t tensor_para_rank,
int prefix_cache_len);
~LlamaWeight();
LlamaWeight(const LlamaWeight& other) = delete;
LlamaWeight& operator=(const LlamaWeight& other) = delete;
void loadModel(std::string dir_path);
std::vector<LlamaDecoderLayerWeight<T>*> decoder_layer_weights;
const T* pre_decoder_embedding_table{};
const T* output_norm_weight{};
const T* post_decoder_embedding_kernel{};
size_t prefix_cache_len_;
int* prefix_cache_token{};
T* prefix_cache_key{};
T* prefix_cache_value{};
private:
void mallocWeights();
size_t hidden_units_;
size_t inter_size_;
size_t vocab_size_;
size_t num_layer_;
WeightType weight_type_;
size_t tensor_para_size_;
size_t tensor_para_rank_;
};
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/Tensor.h"
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <future>
#include <limits>
#include <memory>
#include <mutex>
#include <queue>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
struct Request {
uint64_t id;
bool start_flag;
bool end_flag;
bool stop_flag;
// per rank inputs/outputs
std::vector<TensorMap> inputs;
std::vector<TensorMap> outputs;
using Callback = std::function<void(std::unordered_map<std::string, Tensor>*)>;
Callback stream_cb;
enum
{
kInvalid = 1,
kConflict = 2,
kBusy = 3,
kInactive = 4,
kFail = 5
};
std::promise<int> signal;
};
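// Thread-safe queue connecting LlamaV2<T>::forward() (producer) with the internal
// batching thread (consumer). Stop and inference requests are kept separately; each
// request carries a promise whose future reports the per-request error code.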
class RequestQueue {
public:
std::vector<std::future<int>> enqueue(std::vector<std::shared_ptr<Request>> requests)
{
std::vector<std::future<int>> futures;
futures.reserve(requests.size());
{
std::lock_guard<std::mutex> lock(mutex_);
for (auto& r : requests) {
futures.push_back(r->signal.get_future());
if (r->stop_flag) {
stop_queue_.push(std::move(r));
}
else {
infer_queue_.push(std::move(r));
}
}
}
cv_.notify_one();
return futures;
}
void dequeue(std::vector<std::shared_ptr<Request>>& stop_requests,
std::vector<std::shared_ptr<Request>>& infer_requests,
unsigned max_infer_count,
bool blocking)
{
std::unique_lock<std::mutex> lock(mutex_);
if (blocking) {
cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()); });
}
stop_requests.clear();
while (!stop_queue_.empty()) {
stop_requests.push_back(std::move(stop_queue_.front()));
stop_queue_.pop();
}
infer_requests.clear();
while (!infer_queue_.empty() && infer_requests.size() < max_infer_count) {
infer_requests.push_back(std::move(infer_queue_.front()));
infer_queue_.pop();
}
}
private:
std::queue<std::shared_ptr<Request>> stop_queue_;
std::queue<std::shared_ptr<Request>> infer_queue_;
std::mutex mutex_;
std::condition_variable cv_;
};
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cuda_fp16.h>
namespace cg = cooperative_groups;
namespace fastertransformer {
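// Vectorized helpers for the fused residual-add + RMSNorm kernel below: elements are
// processed as uint4 packs (8 halves or 4 floats) while accumulating the sum of squares
// needed for normalization.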
template<typename T>
struct res_norm_ops_t {};
template<typename T>
struct res_norm_t {
res_norm_ops_t<T> f;
__device__ uint4 addvec(const uint4& a, const uint4& b, float& accum) const
{
uint4 c;
c.x = f.cast(f.add(f.cast(a.x), f.cast(b.x), accum));
c.y = f.cast(f.add(f.cast(a.y), f.cast(b.y), accum));
c.z = f.cast(f.add(f.cast(a.z), f.cast(b.z), accum));
c.w = f.cast(f.add(f.cast(a.w), f.cast(b.w), accum));
return c;
}
__device__ uint4 normvec(const uint4& u, const uint4& s, float factor) const
{
uint4 v;
v.x = f.cast(f.norm(f.cast(u.x), f.cast(s.x), factor));
v.y = f.cast(f.norm(f.cast(u.y), f.cast(s.y), factor));
v.z = f.cast(f.norm(f.cast(u.z), f.cast(s.z), factor));
v.w = f.cast(f.norm(f.cast(u.w), f.cast(s.w), factor));
return v;
}
};
template<>
struct res_norm_ops_t<half> {
__device__ float2 cast(const uint& x) const
{
return __half22float2(reinterpret_cast<const half2&>(x));
}
__device__ uint cast(const float2& x) const
{
auto y = __float22half2_rn(x);
return reinterpret_cast<uint&>(y);
}
__device__ float2 add(const float2& a, const float2& b, float& accum) const
{
float2 c{a.x + b.x, a.y + b.y};
accum += c.x * c.x + c.y * c.y;
return c;
}
__device__ float2 norm(const float2& a, const float2& s, float factor) const
{
return {a.x * s.x * factor, a.y * s.y * factor};
}
};
template<>
struct res_norm_ops_t<float> {
__device__ float cast(const uint& x) const
{
return reinterpret_cast<const float&>(x);
}
__device__ uint cast(const float& x) const
{
return reinterpret_cast<const uint&>(x);
}
__device__ float add(const float& a, const float& b, float& accum) const
{
float c = a + b;
accum += c * c;
return c;
}
__device__ float norm(const float& a, const float& s, float factor) const
{
return a * s * factor;
}
};
template<typename T>
__device__ T blockReduceSum(const cg::thread_block& block, T value)
{
__shared__ float partial[32];
auto tile = cg::tiled_partition<32>(block);
value = cg::reduce(tile, value, cg::plus<float>{});
if (tile.thread_rank() == 0) {
partial[tile.meta_group_rank()] = value;
}
block.sync();
value = tile.thread_rank() < tile.meta_group_size() ? partial[tile.thread_rank()] : T{};
return cg::reduce(tile, value, cg::plus<float>{});
}
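// One thread block per token: adds the residual into `r_data` and writes the
// RMS-normalized result (scaled by `scale`) into `x_data`, using 16-byte (uint4)
// vectorized accesses.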
template<typename T>
__global__ void fusedAddResidualNorm(
T* __restrict__ r_data, T* __restrict__ x_data, const T* __restrict__ scale, float eps, int batch_size, int n_dims)
{
auto block = cg::this_thread_block();
auto grid = cg::this_grid();
constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
const auto b = grid.block_rank();
uint4* __restrict__ r_ptr = reinterpret_cast<uint4*>(r_data + b * n_dims);
uint4* __restrict__ x_ptr = reinterpret_cast<uint4*>(x_data + b * n_dims);
res_norm_t<T> ops;
float thread_sum{};
for (auto i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.num_threads()) {
auto r = r_ptr[i];
auto x = x_ptr[i];
r = ops.addvec(r, x, thread_sum);
r_ptr[i] = r;
}
auto total_sum = blockReduceSum(block, thread_sum);
float s_inv_mean = rsqrt(total_sum / n_dims + eps);
const uint4* __restrict__ s_ptr = reinterpret_cast<const uint4*>(scale);
for (uint i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.num_threads()) {
auto r = r_ptr[i];
auto s = s_ptr[i];
auto o = ops.normvec(r, s, s_inv_mean);
x_ptr[i] = o;
}
}
template<typename T>
void invokeFusedAddResidualRMSNorm(
T* residual, T* inout, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream)
{
constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
FT_CHECK(n_dims % PACK_DIM == 0);
const int n_pack = n_dims / PACK_DIM;
const int n_iter = ((n_pack + 1023) / 1024); // iterations when block size == 1024
int n_threads = (n_pack + n_iter - 1) / n_iter; // adjust block size to avoid tail effect
n_threads = (n_threads + 31) / 32 * 32; // round up to the nearest multiple of warp size
fusedAddResidualNorm<<<batch_size, n_threads, 0, stream>>>(residual, inout, scale, eps, batch_size, n_dims);
}
template void invokeFusedAddResidualRMSNorm(float*, float*, const float*, float, int, int, cudaStream_t);
template void invokeFusedAddResidualRMSNorm(half*, half*, const half*, float, int, int, cudaStream_t);
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include <cuda_runtime.h>
namespace fastertransformer {
template<typename T>
void invokeFusedAddResidualRMSNorm(
T* residual, T* inout, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
namespace fastertransformer {
// fp16, bf16
// n is divided by 2 for this impl
template<typename T>
__global__ void rootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n)
{
using T2 = typename TypeConverter<T>::Type;
__shared__ float s_inv_mean;
float mean = 0.f;
T2* out_ptr = (T2*)out;
const T2* input_ptr = (const T2*)input;
const T2* scale_ptr = (const T2*)scale;
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float2 tmp2 = cuda_cast<float2>(input_ptr[blockIdx.x * n + idx]);
mean += tmp2.x * tmp2.x;
mean += tmp2.y * tmp2.y;
}
mean = blockReduceSum<float>(mean);
if (threadIdx.x == 0) {
s_inv_mean = rsqrt(.5f * mean / (float)n + eps);
}
__syncthreads();
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float2 tmp2 = cuda_cast<float2>(input_ptr[blockIdx.x * n + idx]);
float2 sca2 = cuda_cast<float2>(scale_ptr[idx]);
tmp2.x = tmp2.x * s_inv_mean * sca2.x;
tmp2.y = tmp2.y * s_inv_mean * sca2.y;
out_ptr[blockIdx.x * n + idx] = cuda_cast<T2>(tmp2);
}
}
template<>
__global__ void rootMeanSquareNorm(float* out, const float* input, const float* scale, float eps, int m, int n)
{
__shared__ float s_inv_mean;
float mean = 0.f;
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float tmp = input[blockIdx.x * n + idx];
mean += tmp * tmp;
}
mean = blockReduceSum<float>(mean);
if (threadIdx.x == 0) {
s_inv_mean = rsqrt(mean / static_cast<float>(n) + eps);
}
__syncthreads();
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float tmp = input[blockIdx.x * n + idx];
out[blockIdx.x * n + idx] = tmp * s_inv_mean * scale[idx];
}
}
template<typename T>
void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream)
{
if (sizeof(T) == 2) {
FT_CHECK(n % 2 == 0);
n /= 2;
}
dim3 grid(m);
dim3 block(std::min(n, 1024));
rootMeanSquareNorm<<<grid, block, 0, stream>>>(out, input, scale, eps, m, n);
}
template void invokeRootMeanSquareNorm(float*, const float*, const float*, float, int, int, cudaStream_t);
template void invokeRootMeanSquareNorm(half*, const half*, const half*, float, int, int, cudaStream_t);
// #ifdef ENABLE_BF16
// template void invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t);
// #endif
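// saturate_cast clamps out-of-range values when narrowing; the half specialization
// clamps floats to +/-64512 to avoid overflow.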
template<typename T, typename T0>
__device__ T saturate_cast(T0 x)
{
return x;
}
template<>
__device__ half saturate_cast<half, float>(float x)
{
return (x > 64512.f || x < -64512.f) ? (x > 0.f ? 64512.f : -64512.f) : x;
}
template<typename T>
__global__ void addResidual(T* out, const T* in, size_t n)
{
auto idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
if (idx < n) {
out[idx] = static_cast<T>(static_cast<float>(out[idx]) + static_cast<float>(in[idx]));
}
}
template<typename T>
void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream)
{
auto total = static_cast<size_t>(m) * n;
dim3 block(std::min(total, 1024UL));
dim3 grid((total + block.x - 1) / block.x);
addResidual<<<grid, block, 0, stream>>>(out, in, total);
}
template void invokeAddResidual(float*, const float*, int, int, cudaStream_t);
template void invokeAddResidual(half*, const half*, int, int, cudaStream_t);
// ids [seq_len, batch_size]
// input_ids [batch_size, max_input_len]
__global__ void
fixInputIds(int* ids, const int* input_ids, const int* input_lengths, int batch_size, int seq_len, int max_input_len)
{
int seq_id = threadIdx.x;
int batch_id = blockIdx.x;
for (; seq_id < input_lengths[batch_id]; seq_id += blockDim.x) {
ids[seq_id * batch_size + batch_id] = input_ids[batch_id * max_input_len + seq_id];
}
}
void invokeFixInputIds(int* ids,
const int* input_ids,
const int* input_lengths,
int batch_size,
int seq_len,
int max_input_len,
cudaStream_t st)
{
dim3 block(std::min(1024, max_input_len));
dim3 grid(batch_size);
fixInputIds<<<grid, block, 0, st>>>(ids, input_ids, input_lengths, batch_size, seq_len, max_input_len);
}
template<typename T>
__global__ void sliceCausalMask(T* mask, int seq_len, int key_len, int step)
{
mask += (size_t)blockIdx.x * seq_len * key_len;
for (int i = threadIdx.x; i < seq_len * key_len; i += blockDim.x) {
int row = i / key_len;
int col = i % key_len;
if (col <= row + step) {
mask[i] = static_cast<T>(1.f);
}
else {
mask[i] = static_cast<T>(0.f);
}
}
}
// [step: step+Q, :] of the K*K causal mask
template<typename T>
void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream)
{
FT_CHECK(step == key_len - seq_len);
sliceCausalMask<<<batch_size, 256, 0, stream>>>(mask, seq_len, key_len, step);
}
template void invokeSliceCausalMask(half*, int, int, int, int, cudaStream_t);
template void invokeSliceCausalMask(float*, int, int, int, int, cudaStream_t);
// mask [bsz, max_q_len, max_k_len]
template<typename T>
__global__ void createCausalMasks(T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len)
{
const auto q_len = q_lens[blockIdx.x];
const auto k_len = k_lens[blockIdx.x];
mask += blockIdx.x * max_q_len * max_k_len;
for (int i = threadIdx.x; i < max_q_len * max_k_len; i += blockDim.x) {
const int q = i / max_k_len; // [0, max_q_len)
const int k = i % max_k_len; // [0, max_k_len)
bool is_valid = q < q_len && k < k_len && k <= q + (k_len - q_len);
mask[i] = static_cast<T>(is_valid);
}
}
template<typename T>
void invokeCreateCausalMasks(
T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream)
{
createCausalMasks<<<batch_size, 512, 0, stream>>>(mask, q_lens, k_lens, max_q_len, max_k_len);
}
template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t);
template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t);
template<typename T>
__global__ void extend_key_cache(T** k_dst,
const size_t dst_offset,
const T* k_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto key_src = reinterpret_cast<const uint4*>(k_src);
const auto key_dst = reinterpret_cast<uint4*>(k_dst[batch_id] + dst_offset);
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
const int k_head_size_id = idx % size_per_head_div_x;
const int k_seq_len_id = idx / size_per_head_div_x;
if (k_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, D/x, S[t:t+s]]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
k_head_size_id * max_seq_len + // D/x
t_offset + k_seq_len_id; // s + offset
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
k_seq_len_id * size_per_head_div_x + // s
k_head_size_id; // D/x
key_dst[dst_idx] = key_src[src_idx];
}
}
template<typename T>
__global__ void extend_value_cache(T** v_dst,
const size_t dst_offset,
const T* v_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src);
const auto val_dst = reinterpret_cast<uint4*>(v_dst[batch_id] + dst_offset);
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
(v_seq_len_id + t_offset) * size_per_head_div_x + // s + offset
v_head_size_id; // D/x
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
val_dst[dst_idx] = val_src[src_idx];
}
}
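// Note: both K and V are copied with extend_value_cache, i.e. the key cache is stored in
// the same [H, S, D/x] layout as the value cache; extend_key_cache (which writes the
// [H, D/x, S] layout) is not used by this launcher.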
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t dst_offset,
const T* k_src,
const T* v_src,
int local_batch_size,
const int* query_length,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_q_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num);
extend_value_cache<<<grid, block_sz, 0, stream>>>(
k_dst, dst_offset, k_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
extend_value_cache<<<grid, block_sz, 0, stream>>>(
v_dst, dst_offset, v_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
}
template void invokeExtendKVCache(float**,
float**,
size_t,
const float*,
const float*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream);
template void invokeExtendKVCache(half**,
half**,
size_t,
const half*,
const half*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream);
template<typename T>
__global__ void transpose_key_cache(T* k_dst,
const T** k_src,
const size_t src_offset,
const int head_num,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto key_src = reinterpret_cast<const uint4*>(k_src[batch_id] + src_offset);
const auto key_dst = reinterpret_cast<uint4*>(k_dst);
const auto seq_len = seq_length[batch_id];
const int k_head_size_id = idx % size_per_head_div_x;
const int k_seq_len_id = idx / size_per_head_div_x;
if (k_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, D/x, S[:s]]
const int64_t src_idx = head_id * size_per_head_div_x * max_seq_len + // H
k_head_size_id * max_seq_len + // D/x
k_seq_len_id; // s
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
k_seq_len_id * size_per_head_div_x + // s
k_head_size_id; // D/x
key_dst[dst_idx] = key_src[src_idx];
}
}
template<typename T>
__global__ void transpose_value_cache(T* v_dst, //
const T** v_src,
const size_t src_offset,
const int head_num,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src[batch_id] + src_offset);
const auto val_dst = reinterpret_cast<uint4*>(v_dst);
const auto seq_len = seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id * size_per_head_div_x * max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
val_dst[dst_idx] = val_src[src_idx];
}
}
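// Both caches are read back with transpose_value_cache, matching the [H, S, D/x]
// storage layout used by invokeExtendKVCache above.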
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
T* val_cache_trans,
const T** key_cache,
const T** val_cache,
size_t src_offset,
int batch_size,
const int* key_length,
int max_kv_len,
int max_seq_len,
int size_per_head,
int head_num,
cudaStream_t stream)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_kv_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num);
transpose_value_cache<<<grid, block_sz, 0, stream>>>(
key_cache_trans, key_cache, src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len);
transpose_value_cache<<<grid, block_sz, 0, stream>>>(
val_cache_trans, val_cache, src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len);
}
template void invokeTransposeKVCache(
float*, float*, const float**, const float**, size_t, int, const int*, int, int, int, int, cudaStream_t stream);
template void invokeTransposeKVCache(
half*, half*, const half**, const half**, size_t, int, const int*, int, int, int, int, cudaStream_t stream);
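// Copies tokens from the step-major `ids` buffer ([session_len, batch]) into per-sequence
// rows of `output_ids` ([batch, max_output_len]), dropping the padding between each
// sequence's prompt and its generated tokens.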
__global__ void gatherOutput(int* output_ids,
const int* ids,
const int* context_length,
int max_context_len,
int max_gen_step,
int max_output_len,
int batch_size)
{
const int batch_id = blockIdx.x;
const int context_len = context_length[batch_id];
output_ids += batch_id * max_output_len;
for (int src_idx = threadIdx.x; src_idx < max_gen_step; src_idx += blockDim.x) {
// skip padding for src
if (context_len <= src_idx && src_idx < max_context_len) {
continue;
}
// skip padding for dst
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
}
}
void invokeGatherOutput(int* output_ids,
const int* ids,
const int* context_length,
int max_context_len,
int max_gen_step,
int max_output_len,
int batch_size,
cudaStream_t stream)
{
int block_size = 512;
int grid_size = batch_size;
gatherOutput<<<grid_size, block_size, 0, stream>>>(
output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
}
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <assert.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <numeric>
namespace fastertransformer {
template<typename T>
void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream);
template<typename T>
void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream);
void invokeFixInputIds(int* ids,
const int* input_ids,
const int* input_lengths,
int batch_size,
int seq_len,
int max_input_len,
cudaStream_t st);
template<typename T>
void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream);
template<typename T>
void invokeCreateCausalMasks(
T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream);
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t layer_offset,
const T* k_src,
const T* v_src,
int batch_size,
const int* query_length,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream);
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
T* val_cache_trans,
const T** key_cache,
const T** val_cache,
size_t layer_offset,
int batch_size,
const int* key_length,
int max_kv_len,
int max_seq_len,
int size_per_head,
int head_num,
cudaStream_t stream);
void invokeGatherOutput(int* output_ids,
const int* ids,
const int* context_length,
int max_context_len,
int max_gen_step,
int max_output_len,
int batch_size,
cudaStream_t stream);
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
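// Pimpl wrapper around the flash-attention kernels; AttentionLayout describes the
// batch/sequence/head strides (or optional per-batch base pointers) of Q, K, V and the
// output.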
template<typename T>
class FlashAttentionOp {
public:
struct AttentionLayout {
int stride_batch;
int stride_seq;
int stride_head;
bool use_seqlens = false;
int batch_seqs_offset = 0;
T** batch_seqs = nullptr;
};
struct Params {
T* attn_out;
T* query;
T* key;
T* val;
T* mask;
float* out_accum = nullptr;
int* cu_seqlens_q = nullptr;
int* cu_seqlens_k = nullptr;
AttentionLayout layout_q;
AttentionLayout layout_k;
AttentionLayout layout_v;
AttentionLayout layout_o;
};
public:
FlashAttentionOp(int batch_size, int head_num, int key_len, int seq_len, int size_per_head);
~FlashAttentionOp();
int get_workspace_size() const;
void operator()(Params& params, cudaStream_t st) const;
private:
class impl;
std::unique_ptr<impl> pimpl;
};
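// Debug helper: copies `size` elements of `x` to the host and prints either the whole
// buffer or the first and last 8 values, followed by the sum of absolute values.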
template<typename T>
inline void dump(const T* x, int size, cudaStream_t st, const char* msg, bool full = false)
{
std::vector<T> h_x(size);
cudaMemcpyAsync(h_x.data(), x, sizeof(T) * size, cudaMemcpyDefault, st);
cudaStreamSynchronize(st);
fprintf(stderr, "\n%s:\n", msg);
std::vector<float> h_y(h_x.begin(), h_x.end());
float asum = 0.f;
for (const auto& x : h_y) {
asum += std::fabs(x);
}
if (full) {
for (int i = 0; i < size; ++i) {
printf("%d %.8f\n", i, h_y[i]);
}
}
else {
for (int i = 0; i < 8; ++i) {
fprintf(stderr, "%.8f\n", h_y[i]);
}
for (int i = size - 8; i < size; ++i) {
fprintf(stderr, "%.8f\n", h_y[i]);
}
}
fprintf(stderr, "\nasum = %f\n", asum);
// getchar();
}
template<typename T>
struct TempBuffer {
TempBuffer(size_t size)
{
deviceMalloc(&data, size, false);
}
T* data;
};
template<typename T>
inline T*
transpose_key_cache(T* key_cache, size_t head_num, size_t size_per_head_by_x, size_t mem_len, size_t x, cudaStream_t st)
{
static TempBuffer<T> buf(8192 * 8192);
// from: H Dx, S, x
// to : S, H Dx, x
invokeTransposeAxis01(buf.data, key_cache, head_num * size_per_head_by_x, mem_len, x, st);
return buf.data;
}
template<typename T>
inline T* transpose_value_cache(T* value_cache, size_t head_num, size_t mem_len, size_t size_per_head, cudaStream_t st)
{
static TempBuffer<T> buf(8192 * 8192);
invokeTransposeAxis01(buf.data, value_cache, head_num, mem_len, size_per_head, st);
return buf.data;
}
inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_t st)
{
int h_seq_len = -1;
cudaMemcpyAsync(&h_seq_len, d_seq_len, sizeof(int), cudaMemcpyDefault, st);
cudaStreamSynchronize(st);
FT_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
}
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <fstream>
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <vector>
namespace fastertransformer {
CmpMode compare_mode = kCmpNone;
template<typename T>
struct abs_diff_t {
using type = T;
};
template<>
struct abs_diff_t<half> {
using type = float;
};
template<typename T>
struct abs_diff: public thrust::unary_function<thrust::tuple<T, T>, typename abs_diff_t<T>::type> {
__host__ __device__ float operator()(thrust::tuple<T, T> x) const
{
using R = typename abs_diff_t<T>::type;
auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x));
return r < R(0) ? -r : r;
}
};
template<typename T>
void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream)
{
std::vector<T> h_data(size);
cudaMemcpyAsync(h_data.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream);
check_cuda_error(cudaStreamSynchronize(stream));
size_t nan_cnt = 0;
for (const auto& x : h_data) {
nan_cnt += std::isnan(static_cast<float>(x));
}
if (nan_cnt) {
std::cerr << key << ": NaN count " << nan_cnt << "\n";
}
}
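// Reads a reference tensor from tmp/<key>.cmp and prints the sum (and mean) of absolute
// differences against the device buffer `ptr`.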
template<typename T>
void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
{
// wait for b
check_cuda_error(cudaStreamSynchronize(stream));
// read a from file
thrust::host_vector<T> h_a(size);
{
const auto filename = "tmp/" + key + ".cmp";
std::ifstream ifs(filename, std::ios::binary);
if (!ifs.is_open()) {
std::cerr << key << ": failed to open " + filename << "\n";
return;
}
ifs.seekg(0, ifs.end);
const auto actual_size_in_bytes = ifs.tellg();
ifs.seekg(0, ifs.beg);
const auto expect_size_in_bytes = sizeof(T) * size;
if (actual_size_in_bytes != expect_size_in_bytes) {
std::cerr << key << ": file size in bytes mismatch, expect " << expect_size_in_bytes << ", got "
<< actual_size_in_bytes << "\n";
return;
}
ifs.read((char*)h_a.data(), sizeof(T) * h_a.size());
}
// copy a to device
thrust::device_vector<T> a = h_a;
// create abs(a - b) iterator
thrust::device_ptr<T> dev_ptr(ptr);
auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr));
auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff<T>{});
// sum(abs(a - b))
auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size);
std::cerr << key << ": " << asum << " " << asum / size << "\n";
}
template<typename T>
void CmpWrite(T* ptr, size_t size, std::string key, cudaStream_t stream)
{
std::vector<T> a(size);
// copy a to host
check_cuda_error(cudaMemcpyAsync(a.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream));
check_cuda_error(cudaStreamSynchronize(stream));
// write to file
{
std::ofstream ofs("tmp/" + key + ".cmp", std::ios::binary);
ofs.write((char*)a.data(), sizeof(T) * a.size());
}
}
template<typename T>
void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream)
{
// std::cerr << "Comparing " << key << "\n";
if (mode == kCmpRead) {
CmpRead(ptr, size, key, stream);
}
else if (mode == kCmpWrite) {
CmpWrite(ptr, size, key, stream);
}
else {
// kCmpNone
}
}
template void Compare(int* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template void Compare(float* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template void Compare(half* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template void CheckNan(const float* ptr, size_t size, std::string key, cudaStream_t stream);
template void CheckNan(const half* ptr, size_t size, std::string key, cudaStream_t stream);
std::string format(const std::pair<std::string, Tensor>& p)
{
std::stringstream ss;
ss << p.first << " [";
bool first = true;
for (const auto& x : p.second.shape) {
ss << (first ? "" : ", ") << x;
first = false;
}
ss << "]";
return ss.str();
}
size_t curandStateGetSize()
{
return sizeof(curandState_t);
}
bool isDebug()
{
static const bool is_debug = [] {
const auto level = std::getenv("FT_DEBUG_LEVEL");
if (level && level == std::string("DEBUG")) {
return true;
}
return false;
}();
return is_debug;
}
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/Tensor.h"
#include <cuda_runtime.h>
#include <sstream>
#include <string>
#include <vector>
namespace fastertransformer {
enum CmpMode
{
kCmpNone,
kCmpRead,
kCmpWrite,
};
extern CmpMode compare_mode;
template<typename T>
void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template<typename T>
void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream);
namespace detail {
template<typename T>
std::string to_string(T x)
{
return std::to_string(x);
}
inline std::string to_string(std::string x)
{
return x;
}
} // namespace detail
template<typename... Args>
std::string Concat(std::string key, Args&&... args)
{
std::vector<std::string> args_str{detail::to_string((Args &&) args)...};
for (const auto& s : args_str) {
key.append("_");
key.append(s);
}
return key;
}
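// Example (illustrative): Concat stringifies each argument and joins it to the
// key with '_', so Concat("decoder", 3, 7) yields "decoder_3_7" and
// Concat("block", std::string("ffn")) yields "block_ffn".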
std::string format(const std::pair<std::string, Tensor>& p);
size_t curandStateGetSize();
bool isDebug();
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/prefix_cache.h"
// <L,H,D/X,s,X> -> <L,H,D/X,S[:s],X>
template<typename T>
__global__ void insertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, size_t S)
{
for (int i = threadIdx.x; i < L * H * Dx * s * X; i += blockDim.x) {
int i0 = i / X;
int x = i % X;
int i1 = i0 / s;
int t = i0 % s;
size_t j = (i1 * S + t) * X + x;
key_cache[j] = src[i];
}
}
template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st)
{
insertKeyCache<<<1, 512, 0, st>>>(key_cache, src, L, H, Dx, s, X, S);
}
template void
invokeInsertKeyCache(float* key_cache, const float* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template void
invokeInsertKeyCache(half* key_cache, const half* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
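// Worked example (illustrative, not in the original): with L=1, H=1, D/X=2, X=4,
// s=3 and cache capacity S=8, the source element at (dx=1, t=2, x=3) has flat
// index i = (1*3 + 2)*4 + 3 = 23; the kernel recovers x=3, t=2, i1=1 and writes
// it to j = (1*8 + 2)*4 + 3 = 43, i.e. the same (dx, t, x) slot in the S-padded
// cache. Slots with t >= s in the destination are left untouched.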
// <L,H,s,D> -> <L,H,S[:s],D>
template<typename T>
__global__ void insertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, size_t S)
{
for (int i = threadIdx.x; i < L * H * s * D; i += blockDim.x) {
int i0 = i / D;
int d = i % D;
int i1 = i0 / s;
int t = i0 % s;
size_t j = (i1 * S + t) * D + d;
value_cache[j] = src[i];
}
}
template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st)
{
insertValueCache<<<1, 512, 0, st>>>(value_cache, src, L, H, s, D, S);
}
template void
invokeInsertValueCache(float* value_cache, const float* src, int L, int H, int s, int D, int S, cudaStream_t st);
template void
invokeInsertValueCache(half* value_cache, const half* src, int L, int H, int s, int D, int S, cudaStream_t st);
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include <cuda_fp16.h>
template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st);
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required (VERSION 3.18)
project(tritonfastertransformerbackend LANGUAGES C CXX)
#
# Options
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes")
set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries")
set(TRITON_BACKEND_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/backend repo")
set(TRITON_CORE_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_COMMON_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/common repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
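#
# Illustrative configure invocation (not part of the original file; the install
# prefix is a placeholder):
#
#   cmake -DCMAKE_BUILD_TYPE=Release \
#         -DTRITON_ENABLE_GPU=ON \
#         -DTRITON_COMMON_REPO_TAG=r22.12 \
#         -DTRITON_CORE_REPO_TAG=r22.12 \
#         -DTRITON_BACKEND_REPO_TAG=r22.12 \
#         -DCMAKE_INSTALL_PREFIX=/opt/tritonserver ..
#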
set(USE_TRITONSERVER_DATATYPE "ON")
message("-- Enable USE_TRITONSERVER_DATATYPE")
#
# Dependencies
#
# FetchContent's composability isn't very good. We must include the
# transitive closure of all repos so that we can override the tag.
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_TAG ${TRITON_BACKEND_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)
#
# CUDA
#
if(${TRITON_ENABLE_GPU})
find_package(CUDAToolkit REQUIRED)
endif() # TRITON_ENABLE_GPU
#
# Shared library implementing the Triton Backend API
#
configure_file(libtriton_fastertransformer.ldscript libtriton_fastertransformer.ldscript COPYONLY)
add_library(
triton-fastertransformer-backend SHARED
libfastertransformer.cc
)
add_library(
TritonFasterTransformerBackend::triton-fastertransformer-backend ALIAS triton-fastertransformer-backend
)
find_package(CUDAToolkit REQUIRED)
find_package(CUDA 10.1 REQUIRED)
if (${CUDA_VERSION} GREATER_EQUAL 11.0)
message(STATUS "Add DCUDA11_MODE")
add_definitions("-DCUDA11_MODE")
endif()
set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
target_compile_definitions(triton-fastertransformer-backend
PUBLIC
USE_TRITONSERVER_DATATYPE
BUILD_MULTI_GPU)
target_include_directories(
triton-fastertransformer-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
${TRITON_PYTORCH_INCLUDE_PATHS}
${Python3_INCLUDE_DIRS}
${repo-ft_SOURCE_DIR}
${repo-ft_SOURCE_DIR}/3rdparty/cutlass/include
${repo-core_SOURCE_DIR}/include
)
target_link_directories(
triton-fastertransformer-backend
PRIVATE
${CUDA_PATH}/lib64
)
target_compile_features(triton-fastertransformer-backend PRIVATE cxx_std_14)
target_compile_options(
triton-fastertransformer-backend PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits >#-Werror>
)
if(${TRITON_ENABLE_GPU})
target_compile_definitions(
triton-fastertransformer-backend
PRIVATE TRITON_ENABLE_GPU=1
)
endif() # TRITON_ENABLE_GPU
set_target_properties(
triton-fastertransformer-backend
PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_fastertransformer
SKIP_BUILD_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE
INSTALL_RPATH "$\{ORIGIN\}"
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_fastertransformer.ldscript
LINK_FLAGS "-Wl,--no-as-needed,--version-script libtriton_fastertransformer.ldscript"
)
# Need to turn off unused-but-set-variable due to Torchvision
# Need to turn off unknown-pragmas due to ATen OpenMP
set_target_properties(
triton-fastertransformer-backend
PROPERTIES COMPILE_FLAGS
"-Wno-unknown-pragmas -Wno-unused-but-set-variable"
)
set(TRITON_PYTORCH_LDFLAGS "")
FOREACH(p ${TRITON_PYTORCH_LIB_PATHS})
set(TRITON_PYTORCH_LDFLAGS ${TRITON_PYTORCH_LDFLAGS} "-L${p}")
ENDFOREACH(p)
target_link_libraries(
triton-fastertransformer-backend
PRIVATE
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
triton-core-serverstub # from repo-core
triton-backend-utils # from repo-backend
transformer-shared # from repo-ft
${TRITON_PYTORCH_LDFLAGS}
-lcublas
-lcublasLt
-lcudart
-lcurand
)
if (BUILD_MULTI_GPU)
target_compile_definitions(
triton-fastertransformer-backend
PUBLIC
BUILD_MULTI_GPU
)
target_include_directories(
triton-fastertransformer-backend
PRIVATE
${MPI_INCLUDE_PATH}
)
target_link_directories(
triton-fastertransformer-backend
PRIVATE
${MPI_Libraries}
/usr/local/mpi/lib
)
target_link_libraries(
triton-fastertransformer-backend
PRIVATE
${NCCL_LIBRARIES}
${MPI_LIBRARIES}
)
endif()
if(${TRITON_ENABLE_GPU})
target_link_libraries(
triton-fastertransformer-backend
PRIVATE
CUDA::cudart
)
endif() # TRITON_ENABLE_GPU
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonFasterTransformerBackend)
install(
TARGETS
triton-fastertransformer-backend
EXPORT
triton-fastertransformer-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
)
install(
EXPORT
triton-fastertransformer-backend-targets
FILE
TritonFasterTransformerBackendTargets.cmake
NAMESPACE
TritonFasterTransformerBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_SOURCE_DIR}/cmake/TritonFasterTransformerBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT triton-fastertransformer-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendTargets.cmake
NAMESPACE TritonFasterTransformerBackend::
)
export(PACKAGE TritonFasterTransformerBackend)
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp)
target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils mpi_utils)
add_subdirectory(llama)
// Copyright (c) OpenMMLab. All rights reserved.
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Modified from https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc
#include <stdint.h>
#include <exception>
#include <string>
#include <thread>
#include <vector>
#pragma GCC diagnostic push
//#pragma GCC diagnostic ignored "-Wsign-compare"
#pragma GCC diagnostic ignored "-Wcast-function-type"
#pragma warning(push, 0)
#pragma warning(pop)
#pragma GCC diagnostic pop
// must include triton libraries first
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_memory.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/core/tritonbackend.h"
// FT's libraries depend on Triton's libraries
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/mpi_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
std::exception_ptr ptr[8];
namespace ft = fastertransformer;
namespace triton {
namespace backend {
namespace fastertransformer_backend {
#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \
do { \
TRITONSERVER_Error* raarie_err__ = (X); \
if (raarie_err__ != nullptr) { \
SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \
return; \
} \
} while (false)
// Cuda Error handling
TRITONSERVER_Error*
ConvertCUDAStatusToTritonError(cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg)
{
if (cuda_error != cudaSuccess) {
return TRITONSERVER_ErrorNew(code, cudaGetErrorString(cuda_error));
}
return nullptr; // success
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Ragged Batching
struct RaggedBatchingParams {
bool is_input_ragged = false;
int32_t max_seq_length = 0;
int32_t max_elements_per_seq = 0;
const int32_t* batch_input_ptr = nullptr;
size_t batch_intput_size = 0;
size_t total_input_elements = 0;
};
using RaggedBatchingParam_Map = std::unordered_map<std::string, RaggedBatchingParams>;
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model.
//
class ModelState: public BackendModel {
public:
static TRITONSERVER_Error* Create(TRITONBACKEND_Model* triton_model, ModelState** state);
virtual ~ModelState() = default;
TRITONSERVER_Error* LoadModel(const std::string& artifact_name,
const int32_t node_id,
const int32_t device_id,
const int32_t device_id_start,
const int32_t stream_id,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>& nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comms,
std::string* model_path,
std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance);
int GetGpuSize()
{
return gpu_size;
};
int GetWorldSize()
{
return world_size;
};
int GetParallelSize()
{
return tp_pp_size;
};
int GetInstanceId()
{
return current_model_instance_id++;
};
int GetInstanceGroupCount()
{
return instance_group_count;
};
bool SequenceBatchingEnabled()
{
return sequence_batching_enabled;
};
bool DynamicBatchingEnabled()
{
return dynamic_batching_enabled;
};
std::shared_ptr<AbstractTransformerModel> GetFtModel()
{
return ft_model;
};
private:
ModelState(TRITONBACKEND_Model* triton_model);
TRITONSERVER_Error* AutoCompleteConfig();
std::string GetParameter(const char* parameter);
int current_model_instance_id = 0;
bool sequence_batching_enabled = false;
bool dynamic_batching_enabled = false;
int instance_group_count = 1;
std::shared_ptr<AbstractTransformerModel> ft_model;
int node_id, gpu_size, world_size, tp_pp_size;
std::vector<cudaStream_t> streams_;
std::shared_ptr<AbstractTransformerModel> ModelFactory(common::TritonJson::Value& param,
const std::string& model_filename);
};
TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
try {
*state = new ModelState(triton_model);
}
catch (const BackendModelException& ex) {
RETURN_ERROR_IF_TRUE(ex.err_ == nullptr,
TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelException"));
RETURN_IF_ERROR(ex.err_);
}
// Auto-complete the configuration if requested, or if the model type is T5-Encoder or bert
bool auto_complete_config = false;
RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig(triton_model, &auto_complete_config));
auto_complete_config |=
(*state)->GetParameter("model_type") == "T5-Encoder" || (*state)->GetParameter("model_type") == "bert";
if (auto_complete_config) {
RETURN_IF_ERROR((*state)->AutoCompleteConfig());
triton::common::TritonJson::WriteBuffer json_buffer;
(*state)->ModelConfig().Write(&json_buffer);
TRITONSERVER_Message* message;
RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(&message, json_buffer.Base(), json_buffer.Size()));
RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(triton_model, 1 /* config_version */, message));
}
return nullptr; // success
}
std::string param_get(common::TritonJson::Value& param, const char* field, const std::string& fallback = "")
{
common::TritonJson::Value key;
std::string value = fallback;
param.MemberAsObject(field, &key);
key.MemberAsString("string_value", &value);
return value;
}
int param_get_int(common::TritonJson::Value& param, const char* field, int fallback = 0)
{
int ret = fallback;
try {
ret = std::stoi(param_get(param, field));
}
catch (std::invalid_argument& ia) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR,
(std::string("Invalid configuration argument '") + field + "': " + ia.what()).c_str());
}
return ret;
}
float param_get_float(common::TritonJson::Value& param, const char* field, float fallback = 0.0)
{
float ret = fallback;
try {
ret = std::stof(param_get(param, field));
}
catch (std::invalid_argument& ia) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR,
(std::string("Invalid configuration argument '") + field + "': " + ia.what()).c_str());
}
return ret;
}
bool param_get_bool(common::TritonJson::Value& param, const char* field, bool fallback = false)
{
return static_cast<bool>(param_get_int(param, field, static_cast<int>(fallback)));
}
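// Illustrative config.pbtxt snippet that these helpers read (the values are
// placeholders). Each entry under "parameters" carries its value in the
// "string_value" field, which param_get/param_get_int/param_get_float/
// param_get_bool then parse:
//
//   parameters { key: "tensor_para_size" value: { string_value: "2" } }
//   parameters { key: "pipeline_para_size" value: { string_value: "1" } }
//   parameters { key: "data_type" value: { string_value: "fp16" } }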
std::shared_ptr<AbstractTransformerModel> ModelState::ModelFactory(common::TritonJson::Value& param,
const std::string& model_filename)
{
std::shared_ptr<AbstractTransformerModel> ft_model;
const std::string model_dir = param_get(
param, "model_checkpoint_path", JoinPath({RepositoryPath(), std::to_string(Version()), model_filename}));
const std::string model_type = param_get(param, "model_type", "GPT");
const std::string data_type = param_get(param, "data_type");
const int tp = param_get_int(param, "tensor_para_size");
const int pp = param_get_int(param, "pipeline_para_size");
const int custom_ar = param_get_int(param, "enable_custom_all_reduce");
const std::string dt_message = std::string("Invalid configuration argument 'data_type': ") + data_type;
if (model_type == "Llama") {
if (data_type == "fp16") {
ft_model = std::make_shared<LlamaTritonModel<half>>(tp, pp, custom_ar, model_dir);
}
else {
ft_model = std::make_shared<LlamaTritonModel<float>>(tp, pp, custom_ar, model_dir);
}
}
else {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, ("Unknown model \"" + model_type + "\"").c_str()));
}
return ft_model;
}
ModelState::ModelState(TRITONBACKEND_Model* triton_model): BackendModel(triton_model, true)
{
node_id = ft::mpi::getCommWorldRank();
int num_nodes = ft::mpi::getCommWorldSize();
triton::common::TritonJson::WriteBuffer buffer;
ModelConfig().PrettyWrite(&buffer);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("model configuration:\n") + buffer.Contents()).c_str());
common::TritonJson::Value param;
model_config_.MemberAsObject("parameters", &param);
// instance groups
triton::common::TritonJson::Value instance_group, instance_obj, instance_group_count_val, instance_group_kind;
if (!ModelConfig().Find("instance_group", &instance_group) || instance_group.ArraySize() > 1) {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, "Only supports one instance group !"));
}
instance_group.IndexAsObject(0, &instance_obj);
instance_obj.Find("count", &instance_group_count_val);
instance_obj.Find("kind", &instance_group_kind);
std::string instance_group_kind_str;
int64_t instance_group_count_int64 = 1;
instance_group_kind.AsString(&instance_group_kind_str);
instance_group_count_val.AsInt(&instance_group_count_int64);
instance_group_count = (int)instance_group_count_int64;
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
("Instance group type: " + instance_group_kind_str + " count: " + std::to_string(instance_group_count_int64))
.c_str());
if (instance_group_kind_str != "KIND_CPU") {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, "Instance Group: only KIND_CPU supports!"));
}
// instance group validation
bool multi_node_enabled = num_nodes > 1;
tp_pp_size = param_get_int(param, "tensor_para_size") * param_get_int(param, "pipeline_para_size");
gpu_size = ft::getDeviceCount();
world_size = gpu_size * num_nodes;
int model_instance_size = num_nodes > 1 ? gpu_size : tp_pp_size;
bool multi_model_instance_valid = (multi_node_enabled && tp_pp_size == world_size && instance_group_count == 1)
|| (!multi_node_enabled && gpu_size % tp_pp_size == 0
&& model_instance_size * instance_group_count >= gpu_size);
printf("num_nodes=%d\n", num_nodes);
printf("tp_pp_size=%d\n", tp_pp_size);
printf("gpu_size=%d\n", gpu_size);
printf("world_size=%d\n", world_size);
printf("model_instance_size=%d\n", model_instance_size);
if (!multi_model_instance_valid) {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED,
"1. Number of visible GPUs must be evenly divisble by TP * PP \n"
"2. Number of visible GPUs must be <= instance count * TP * PP \n"
"3. Multi-Node Inference only support one model instance \n"));
}
int64_t max_batch_size = 0;
model_config_.MemberAsInt("max_batch_size", &max_batch_size);
// sequence batching
triton::common::TritonJson::Value sequence_batching;
sequence_batching_enabled = ModelConfig().Find("sequence_batching", &sequence_batching);
std::string sequence_batching_log = sequence_batching_enabled ? "enabled" : "disabled";
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Sequence Batching: ") + sequence_batching_log).c_str());
// if (sequence_batching_enabled && max_batch_size != 1) {
// THROW_IF_BACKEND_MODEL_ERROR(TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED,
// "Sequence Batching for interactive text generation: only supports max
// batch size = 1 currently !"));
// }
// dynamic batching
triton::common::TritonJson::Value dynamic_batching;
dynamic_batching_enabled = ModelConfig().Find("dynamic_batching", &dynamic_batching);
std::string dynamic_batching_log = dynamic_batching_enabled ? "enabled" : "disabled";
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Dynamic Batching: ") + dynamic_batching_log).c_str());
if (dynamic_batching_enabled && sequence_batching_enabled) {
THROW_IF_BACKEND_MODEL_ERROR(TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED,
"Sequence Batching cannot work with dynamic "
"batching at the same time !"));
}
std::string model_filename;
model_config_.MemberAsString("default_model_filename", &model_filename);
if (model_filename == "") {
model_filename = std::to_string(param_get_int(param, "tensor_para_size")) + "-gpu";
}
ft_model = ModelFactory(param, model_filename);
std::cout << ft_model->toString();
int total_weight_gpu_size = (instance_group_count * model_instance_size) >= gpu_size ?
gpu_size :
(instance_group_count * model_instance_size);
streams_.resize(instance_group_count * model_instance_size);
/* create shared weights
assume 8 GPUs, 8 model instances and tensor para size 2:
the model instances are distributed to GPUs [0, 1], [2, 3], [4, 5], [6, 7],
[0, 1], [2, 3], [4, 5], [6, 7];
the two model instances placed on GPUs [0, 1] share the same weights
*/
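/* Worked example (illustrative): with node_id = 0, gpu_size = 8 and
tp_pp_size = 2, the loop below computes rank = 0 * 8 + gid % 2, so GPUs
0, 2, 4, 6 create the rank-0 weight shard and GPUs 1, 3, 5, 7 the rank-1
shard; instances mapped onto the same GPUs reuse those shards. */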
std::vector<std::thread> threads;
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Before Loading Weights:")).c_str());
ft::print_mem_usage();
for (int gid = 0; gid < total_weight_gpu_size; gid++) {
int rank = node_id * gpu_size + gid % tp_pp_size;
threads.push_back(std::thread(&AbstractTransformerModel::createSharedWeights, ft_model, gid, rank));
}
for (auto& t : threads) {
t.join();
}
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("After Loading Weights:")).c_str());
ft::print_mem_usage();
}
TRITONSERVER_Error*
ModelState::LoadModel(const std::string& artifact_name,
const int32_t node_id,
const int32_t device_id,
const int32_t device_id_start,
const int32_t stream_id,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>& nccl_params_instance,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comms,
std::string* model_path,
std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance)
{
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(
cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, "Failed to set cuda device"),
"Failed to set cuda device");
std::string cc_model_filename = artifact_name;
if (cc_model_filename.empty()) {
cc_model_filename = "gpt3-model";
}
if (!node_id && !device_id) {
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Before Loading Model:")).c_str());
}
ft::print_mem_usage();
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(cudaStreamCreate(&streams_[stream_id]),
TRITONSERVER_ERROR_INTERNAL,
"Failed to create the stream"),
"Failed to create the stream");
const int rank = node_id * GetGpuSize() + device_id - device_id_start;
auto model_instance = ft_model->createModelInstance(
device_id, rank, streams_[stream_id], nccl_params_instance, custom_all_reduce_comms);
ft_model_instance->reset(model_instance.release());
if (!node_id && !device_id) {
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("After Loading Model:")).c_str());
}
ft::print_mem_usage();
return nullptr; // success
}
TRITONSERVER_Error* ModelState::AutoCompleteConfig()
{
if (GetParameter("model_type") == "T5-Encoder") {
const std::string data_type = GetParameter("data_type");
auto& config = ModelConfig();
common::TritonJson::Value outputs, output, dtype_object;
std::string name;
config.MemberAsArray("output", &outputs);
std::unordered_map<std::string, std::string> return_type_map{
{"fp16", "TYPE_FP16"}, {"fp32", "TYPE_FP32"}, {"bf16", "TYPE_BF16"}};
std::set<std::string> outputs_to_modify = {"output_hidden_state", "output_attentions"};
for (size_t idx = 0; idx < outputs.ArraySize(); idx++) {
outputs.IndexAsObject(idx, &output);
output.MemberAsString("name", &name);
if (outputs_to_modify.find(name) == outputs_to_modify.end()) {
continue;
}
output.Find("data_type", &dtype_object);
dtype_object.SetString(return_type_map[data_type]);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
("Automatically setting return data_type for \"" + name + "\" to \""
+ return_type_map[data_type] + "\"")
.c_str());
}
}
else if (GetParameter("model_type") == "bert") {
const std::string data_type = GetParameter("data_type");
auto& config = ModelConfig();
common::TritonJson::Value inputs, input, dtype_object;
common::TritonJson::Value outputs, output;
std::string name;
config.MemberAsArray("input", &inputs);
config.MemberAsArray("output", &outputs);
std::unordered_map<std::string, std::string> return_type_map{
{"fp16", "TYPE_FP16"}, {"fp32", "TYPE_FP32"}, {"bf16", "TYPE_BF16"}};
for (size_t idx = 0; idx < inputs.ArraySize(); idx++) {
inputs.IndexAsObject(idx, &input);
input.MemberAsString("name", &name);
if (name != "input_hidden_state") {
continue;
}
input.Find("data_type", &dtype_object);
dtype_object.SetString(return_type_map[data_type]);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
("Automatically setting return data_type for "
"\"input_hidden_state\" to \""
+ return_type_map[data_type] + "\"")
.c_str());
}
for (size_t idx = 0; idx < outputs.ArraySize(); idx++) {
outputs.IndexAsObject(idx, &output);
output.MemberAsString("name", &name);
if (name != "output_hidden_state") {
continue;
}
output.Find("data_type", &dtype_object);
dtype_object.SetString(return_type_map[data_type]);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
("Automatically setting return data_type for "
"\"output_hidden_state\" to \""
+ return_type_map[data_type] + "\"")
.c_str());
}
}
else {
// Auto-complete configuration is not supported since fastertransformer does
// not store/capture sufficient model metadata, so just log a warning instead.
LOG_MESSAGE(TRITONSERVER_LOG_WARN,
(std::string("skipping model configuration auto-complete for '") + Name()
+ "': not supported for fastertransformer backend")
.c_str());
}
return nullptr; // success
}
std::string ModelState::GetParameter(const char* parameter)
{
auto& config = ModelConfig();
common::TritonJson::Value parameters, model_type_obj;
std::string model_type;
config.MemberAsObject("parameters", &parameters);
parameters.MemberAsObject(parameter, &model_type_obj);
model_type_obj.MemberAsString("string_value", &model_type);
return model_type;
}
struct stream_callback_ctx_t {
size_t total_batch_size;
TRITONBACKEND_Request** requests;
uint32_t request_count;
std::vector<TRITONBACKEND_Response*>* responses;
std::vector<TRITONBACKEND_ResponseFactory*>* factories;
BackendModelInstance* model;
};
void generate_response_placeholders(std::vector<TRITONBACKEND_Response*>* responses,
std::vector<TRITONBACKEND_ResponseFactory*>* factories)
{
TRITONSERVER_Error* err = nullptr;
for (auto factory : *factories) {
TRITONBACKEND_Response* response;
err = TRITONBACKEND_ResponseNewFromFactory(&response, factory);
if (err) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response from factory");
TRITONSERVER_ErrorDelete(err);
}
responses->push_back(response);
}
}
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each TRITONBACKEND_ModelInstance.
//
class ModelInstanceState: public BackendModelInstance {
public:
static TRITONSERVER_Error*
Create(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state);
virtual ~ModelInstanceState();
// Get the state of the model that corresponds to this instance.
ModelState* StateForModel() const
{
return model_state_;
}
// Execute...
void ProcessRequests(TRITONBACKEND_Request** requests, const uint32_t request_count);
std::shared_ptr<std::unordered_map<std::string, Tensor>>
Execute(std::vector<TRITONBACKEND_Response*>* responses,
stream_callback_ctx_t* context,
const uint32_t response_count,
std::shared_ptr<std::unordered_map<std::string, Tensor>> input_tensors);
void ReadOutputTensors(size_t total_batch_size,
std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses);
int GetModelInstanceCount()
{
return model_instance_count_;
};
int GetModelInstanceId()
{
return model_instance_id_;
};
private:
ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance);
TRITONSERVER_Error* ValidateInputs();
TRITONSERVER_Error* ValidateOutputs();
void SetInputTensors(size_t total_batch_size,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses,
BackendInputCollector* collector,
std::vector<const char*>* input_names,
std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors,
std::vector<BackendMemory*>* input_memories,
bool* cuda_copy);
void BroadcastInputTensors(std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors);
ModelState* model_state_;
// model instance id
int model_instance_count_ = 1;
int model_instance_id_ = 0;
int model_instance_gpu_size_ = 1;
int model_instance_device_id_start_ = 0;
// output tensor stream
cudaStream_t output_stream_;
// tensor parallel + pipeline parallel
int gpu_size_ = 1;
int world_size_ = 1;
int tp_pp_size_ = 1;
// Should we use the streaming API?
bool is_decoupled_ = false;
// The full path to the FT model file.
std::string model_path_;
std::vector<std::unique_ptr<AbstractTransformerModelInstance>> ft_model_instance_;
std::unique_ptr<ft::AbstractInstanceComm> instance_comm_;
// inter-node broadcast buffer
std::vector<char*> bcast_buffers;
// Map from configuration name for an input to the index of
// that input in the model.
std::unordered_map<std::string, int> input_index_map_;
// Map from configuration name for an output to the index of
// that output in the model.
std::unordered_map<std::string, TRITONSERVER_DataType> output_dtype_map_;
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params_;
// custom all reduce comms
std::vector<std::shared_ptr<ft::AbstractCustomComm>> custom_all_reduce_comms_;
};
TRITONSERVER_Error* ModelInstanceState::Create(ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state)
{
try {
*state = new ModelInstanceState(model_state, triton_model_instance);
}
catch (const BackendModelInstanceException& ex) {
RETURN_ERROR_IF_TRUE(ex.err_ == nullptr,
TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelInstanceException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
int ThreadLoadModel(ModelState* model_state,
const std::string& artifact_name,
const int32_t node_id,
const int32_t device_id,
const int32_t device_id_start,
const int32_t stream_id,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comms,
std::string* model_path,
std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance)
{
THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel(artifact_name,
node_id,
device_id,
device_id_start,
stream_id,
nccl_params,
custom_all_reduce_comms,
model_path,
ft_model_instance));
return 0;
}
ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance):
BackendModelInstance(model_state, triton_model_instance), model_state_(model_state)
{
int node_id = ft::mpi::getCommWorldRank();
int num_nodes = ft::mpi::getCommWorldSize();
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Model name ") + ArtifactFilename()).c_str());
triton::common::TritonJson::Value transaction_policy;
is_decoupled_ = false;
model_state_->ModelConfig().MemberAsObject("model_transaction_policy", &transaction_policy);
transaction_policy.MemberAsBool("decoupled", &is_decoupled_);
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Use ") + (is_decoupled_ ? "DECOUPLED (streaming)" : "COUPLED (classic)") + " API.").c_str());
THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs());
THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs());
// NOTE: model instance params
model_instance_id_ = model_state->GetInstanceId();
model_instance_count_ = model_state->GetInstanceGroupCount();
tp_pp_size_ = model_state->GetParallelSize();
gpu_size_ = model_state->GetGpuSize();
world_size_ = model_state->GetWorldSize();
model_instance_gpu_size_ = num_nodes > 1 ? gpu_size_ : tp_pp_size_;
ft_model_instance_.resize(model_instance_gpu_size_);
std::vector<std::thread> threads;
std::shared_ptr<AbstractTransformerModel> shared_ft_model = model_state->GetFtModel();
// NOTE: CPU_KIND only, the backend fully controls how to distribute models to
// GPUs
model_instance_device_id_start_ = (model_instance_id_ * model_instance_gpu_size_) % gpu_size_;
// create output tensor stream
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(cudaSetDevice(model_instance_device_id_start_),
TRITONSERVER_ERROR_INTERNAL,
"Failed to set cuda device"),
"Failed to set cuda device");
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(
cudaStreamCreate(&output_stream_), TRITONSERVER_ERROR_INTERNAL, "Failed to create the stream"),
"Failed to create the stream");
// create nccl params
nccl_params_ = shared_ft_model->createNcclParams(node_id, model_instance_device_id_start_, num_nodes > 1);
shared_ft_model->createCustomComms(&custom_all_reduce_comms_, world_size_);
std::string model_instance_gpu_ids = "[ ";
for (int gid = model_instance_device_id_start_; gid < model_instance_device_id_start_ + model_instance_gpu_size_;
gid++) {
model_instance_gpu_ids += (std::to_string(gid) + " ");
threads.push_back(std::thread(ThreadLoadModel,
model_state,
ArtifactFilename(),
node_id,
gid,
model_instance_device_id_start_,
model_instance_id_ * model_instance_gpu_size_ + gid,
nccl_params_,
custom_all_reduce_comms_[gid - model_instance_device_id_start_],
&model_path_,
&ft_model_instance_[gid - model_instance_device_id_start_]));
}
model_instance_gpu_ids += "]";
for (auto& t : threads) {
t.join();
}
instance_comm_ = shared_ft_model->createInstanceComm(tp_pp_size_);
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("Model instance is created on GPU ") + model_instance_gpu_ids).c_str());
}
ModelInstanceState::~ModelInstanceState()
{
#ifdef TRITON_ENABLE_GPU
#endif // TRITON_ENABLE_GPU
for (auto bcast_buffer : bcast_buffers) {
free(bcast_buffer);
}
}
TRITONSERVER_Error* ModelInstanceState::ValidateInputs()
{
triton::common::TritonJson::Value ios, bios;
// input
std::string name, data_type;
triton::common::TritonJson::Value jshape;
// batch input
std::string kind, target_name, source_input;
triton::common::TritonJson::Value target_name_array, source_input_array;
model_state_->ModelConfig().MemberAsArray("input", &ios);
model_state_->ModelConfig().MemberAsArray("batch_input", &bios);
std::vector<std::string> valid_batch_input;
// batch input
for (size_t size = 0; size < bios.ArraySize(); size++) {
triton::common::TritonJson::Value batch_input;
bios.IndexAsObject(size, &batch_input);
batch_input.MemberAsString("kind", &kind);
batch_input.MemberAsArray("target_name", &target_name_array);
batch_input.MemberAsString("data_type", &data_type);
batch_input.MemberAsArray("source_input", &source_input_array);
target_name_array.IndexAsString(0, &target_name);
source_input_array.IndexAsString(0, &source_input);
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("Get batch input kind: " + kind + ", target_name: " + target_name
+ ", data_type: " + data_type + ", source_input: " + source_input)
.c_str()));
if (kind == "BATCH_ITEM_SHAPE" && data_type == "TYPE_INT32" && source_input + "_item_shape" == target_name) {
valid_batch_input.emplace_back(std::move(source_input));
}
}
// input
for (size_t size = 0; size < ios.ArraySize(); size++) {
triton::common::TritonJson::Value input;
ios.IndexAsObject(size, &input);
input.MemberAsString("name", &name);
input.MemberAsString("data_type", &data_type);
input.MemberAsArray("dims", &jshape);
triton::common::TritonJson::Value allow_ragged_batch_json;
bool allow_ragged_batch = false;
if (input.Find("allow_ragged_batch", &allow_ragged_batch_json)) {
RETURN_IF_ERROR(allow_ragged_batch_json.AsBool(&allow_ragged_batch));
}
if (allow_ragged_batch
&& std::find(valid_batch_input.begin(), valid_batch_input.end(), name) == valid_batch_input.end()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string("Ragged Batch [ " + name + " ] needs the corresponding batch_input item shape !").c_str());
}
std::vector<int64_t> shape;
for (size_t size = 0; size < jshape.ArraySize(); size++) {
int64_t value = 0;
jshape.IndexAsInt(size, &value);
shape.push_back(value);
}
std::string str_shape = "[";
for (uint i = 0; i < shape.size(); i++) {
str_shape = str_shape + std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_shape = str_shape + ", ";
}
else {
str_shape = str_shape + "]";
}
}
std::string allow_ragged_batch_str = allow_ragged_batch ? "true" : "false";
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("Get input name: " + name + ", type: " + data_type + ", shape: " + str_shape
+ ", allow_ragged_batch: " + allow_ragged_batch_str)
.c_str()));
}
return nullptr; // success
}
TRITONSERVER_Error* ModelInstanceState::ValidateOutputs()
{
triton::common::TritonJson::Value ios;
RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios));
std::string name, data_type;
triton::common::TritonJson::Value jshape;
model_state_->ModelConfig().MemberAsArray("output", &ios);
for (size_t size = 0; size < ios.ArraySize(); size++) {
triton::common::TritonJson::Value input;
ios.IndexAsObject(size, &input);
input.MemberAsString("name", &name);
input.MemberAsString("data_type", &data_type);
input.MemberAsArray("dims", &jshape);
std::vector<int64_t> shape;
for (size_t size = 0; size < jshape.ArraySize(); size++) {
int64_t value = 0;
jshape.IndexAsInt(size, &value);
shape.push_back(value);
}
std::string str_shape = "[";
for (uint i = 0; i < shape.size(); i++) {
str_shape = str_shape + std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_shape = str_shape + ", ";
}
else {
str_shape = str_shape + "]";
}
}
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Get output name: " + name + ", type: " + data_type + ", shape: " + str_shape).c_str()));
}
return nullptr; // success
}
void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const uint32_t request_count)
{
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + std::to_string(request_count)
+ " requests")
.c_str());
uint64_t exec_start_ns = 0;
SET_TIMESTAMP(exec_start_ns);
const int max_batch_size = model_state_->MaxBatchSize();
// For each request collect the total batch size for this inference
// execution. The batch-size, number of inputs, and size of each
// input has already been checked, so we don't need to do that here.
size_t total_batch_size = 0;
// bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled();
// size_t real_batch_dim = (int) sequence_batching_enabled;
constexpr size_t real_batch_dim = 0;
// only one batch slot per model instance when sequence_batching enabled
for (size_t i = 0; i < request_count; i++) {
// If we get a nullptr request then something is badly wrong. Fail
// and release all requests.
if (requests[i] == nullptr) {
RequestsRespondWithError(
requests,
request_count,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string("null request given to FasterTransformer backend for '" + Name() + "'").c_str()));
return;
}
if (max_batch_size > 0) {
// Retrieve the batch size from one of the inputs, if the model
// supports batching, the first dimension size is batch size
int index = 0;
while (true) {
TRITONBACKEND_Input* input;
TRITONSERVER_Error* err_0 = TRITONBACKEND_RequestInputByIndex(requests[i], index, &input);
if (err_0 == nullptr) {
const char* input_name;
const int64_t* shape;
TRITONSERVER_Error* err_1 =
TRITONBACKEND_InputProperties(input, &input_name, nullptr, &shape, nullptr, nullptr, nullptr);
std::string input_name_str = std::string(input_name);
if (err_1 == nullptr) {
if (input_name_str != "START" && input_name_str != "END" && input_name_str != "READY") {
total_batch_size += shape[real_batch_dim];
break;
}
index++;
}
else {
RequestsRespondWithError(requests, request_count, err_1);
return;
}
}
else {
RequestsRespondWithError(requests, request_count, err_0);
return;
}
}
}
else {
total_batch_size += 1;
}
}
// If there are no valid payloads then no need to run the inference.
if (total_batch_size == 0) {
return;
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("get total batch_size = ") + std::to_string(total_batch_size)).c_str());
// Make sure the maximum batch size is not exceeded. The
// total_batch_size must be 1 for models that don't support batching
// (i.e. max_batch_size == 0). If max_batch_size is exceeded then
// scheduler has done something badly wrong so fail and release all
// requests.
if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) {
RequestsRespondWithError(
requests,
request_count,
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
std::string("batch size " + std::to_string(total_batch_size) + " for '" + Name()
+ "', max allowed is " + std::to_string(max_batch_size))
.c_str()));
return;
}
// At this point we are committed to running inference with all
// 'requests'. Create a response for each request. During input
// processing if there is an error with any request that error will
// be sent immediately with the corresponding response (and the
// response unique_ptr will then be nullptr). The request object
// itself will not be released until after all inferencing is done
// (below) as we may need to access the request object when
// determining how to process outputs (for example, even if we don't
// need the outputs for a request that has an error, we do need to
// know the size of those outputs associated with the request so we
// can skip them in the output tensors).
//
// When operating in the decoupled mode, responses should be created
// from factories. Here, we instantiate a factory for each request and
// generate the first response. At each new result from the model the
// generated response is filled, sent, and another response is created
// from the factory. The last response is sent just like in the
// non-decoupled mode.
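// Sketch of the decoupled flow implemented below (for orientation only):
// a TRITONBACKEND_ResponseFactory is created per request; streaming_callback()
// fills one response per intermediate result, sends it with flags = 0, and then
// creates the next placeholder from the factory; the final response is sent
// after Execute() with TRITONSERVER_RESPONSE_COMPLETE_FINAL, just like in the
// non-decoupled path.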
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
std::vector<TRITONBACKEND_ResponseFactory*> factories;
for (size_t i = 0; i < request_count; i++) {
if (is_decoupled_) {
TRITONBACKEND_ResponseFactory* factory;
auto err = TRITONBACKEND_ResponseFactoryNew(&factory, requests[i]);
if (err == nullptr) {
factories.emplace_back(factory);
}
else {
factories.emplace_back(nullptr);
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response factory");
TRITONSERVER_ErrorDelete(err);
}
}
else {
TRITONBACKEND_Response* response;
auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
if (err == nullptr) {
responses.emplace_back(response);
}
else {
responses.emplace_back(nullptr);
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response");
TRITONSERVER_ErrorDelete(err);
}
}
}
std::vector<const char*> input_names;
std::shared_ptr<std::unordered_map<std::string, Tensor>> input_tensors =
std::make_shared<std::unordered_map<std::string, Tensor>>();
std::vector<BackendMemory*> input_memories;
bool cuda_copy = false;
if (is_decoupled_) {
generate_response_placeholders(&responses, &factories);
}
BackendInputCollector collector(requests,
request_count,
&responses,
model_state_->TritonMemoryManager(),
model_state_->EnablePinnedInput(),
CudaStream());
SetInputTensors(total_batch_size,
requests,
request_count,
&responses,
&collector,
&input_names,
&input_tensors,
&input_memories,
&cuda_copy);
// Wait for any in-flight input tensor copies to complete.
#ifdef TRITON_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(CudaStream());
}
#endif
uint64_t compute_start_ns = 0;
SET_TIMESTAMP(compute_start_ns);
stream_callback_ctx_t context = {total_batch_size, requests, request_count, &responses, &factories, this};
auto output_tensors = Execute(&responses, &context, request_count, input_tensors);
uint64_t compute_end_ns = 0;
SET_TIMESTAMP(compute_end_ns);
// Free BackendMemory used for inputs
for (BackendMemory* mem : input_memories) {
delete mem;
}
input_memories.clear();
ReadOutputTensors(total_batch_size, output_tensors, requests, request_count, &responses);
uint64_t exec_end_ns = 0;
SET_TIMESTAMP(exec_end_ns);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("get response size = ") + std::to_string(responses.size())).c_str());
// Send all the responses that haven't already been sent because of
// an earlier error. Note that the responses are not set to nullptr
// here as we need that indication below to determine if the request
// was successful or not.
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send FasterTransformer backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("response is sent")).c_str());
}
else {
LOG_MESSAGE(TRITONSERVER_LOG_WARN, (std::string("response is nullptr")).c_str());
}
}
// Report statistics for each request.
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
LOG_IF_ERROR(TRITONBACKEND_ModelInstanceReportStatistics(TritonModelInstance(),
request,
(responses[r] != nullptr) /* success */,
exec_start_ns,
compute_start_ns,
compute_end_ns,
exec_end_ns),
"failed reporting request statistics");
LOG_IF_ERROR(TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request");
}
// Report the entire batch statistics.
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TritonModelInstance(), total_batch_size, exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting batch request statistics");
}
void streaming_callback(std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors, void* ctx)
{
stream_callback_ctx_t* context = reinterpret_cast<stream_callback_ctx_t*>(ctx);
ModelInstanceState* model = reinterpret_cast<ModelInstanceState*>(context->model);
std::vector<TRITONBACKEND_Response*>* responses = context->responses;
model->ReadOutputTensors(
context->total_batch_size, output_tensors, context->requests, context->request_count, responses);
for (auto& response : *responses) {
if (response != nullptr) {
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str());
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr),
"failed to send FasterTransformer backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str());
}
else {
LOG_MESSAGE(TRITONSERVER_LOG_WARN, (std::string("streaming response is nullptr")).c_str());
}
}
responses->clear();
generate_response_placeholders(responses, context->factories);
}
int ThreadForward(std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance,
std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors,
std::shared_ptr<std::unordered_map<std::string, Tensor>>* output_tensors,
ft::AbstractInstanceComm* instance_comm,
std::exception_ptr* exception_ptr,
const int device_id,
const int use_stream_cb,
stream_callback_ctx_t* context)
{
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(
cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, "Failed to set cuda device"),
"Failed to set cuda device");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Start to forward")).c_str());
if (use_stream_cb) {
(*ft_model_instance)->registerCallback(streaming_callback, (void*)context);
}
*output_tensors = (*ft_model_instance)->forward(*input_tensors, instance_comm);
if (use_stream_cb) {
(*ft_model_instance)->unRegisterCallback();
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Stop to forward")).c_str());
if ((*output_tensors)->count("error_message")) {
*exception_ptr = *((std::exception_ptr*)((*output_tensors)->at("error_message").data));
}
return 0;
}
void triton_check_inputs(std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors, const char* filename)
{
auto& output = output_tensors->at("output_ids");
auto shape = output.shape;
assert(shape.size() == 3);
assert(output.type == TYPE_UINT32);
auto batch_size = shape[0];
auto length = shape[2];
std::string fName = filename;
auto file = std::ofstream(fName, std::ios::out);
if (file.is_open()) {
for (size_t i = 0; i < batch_size; i++) {
for (size_t j = 0; j < length; j++) {
file << ((uint32_t*)output.data)[i * length + j] << " ";
}
file << std::endl;
}
}
}
void ModelInstanceState::BroadcastInputTensors(std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors)
{
int node_id = ft::mpi::getCommWorldRank();
uint32_t input_count = node_id ? 0 : (*input_tensors)->size();
ft::mpi::bcast(&input_count, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD);
if (input_count > bcast_buffers.size()) {
bcast_buffers.resize(input_count);
}
if (node_id) {
for (uint input_index = 0; input_index < input_count; input_index++) {
std::vector<size_t> batchn_shape;
int64_t shape_size = 0;
int64_t buffer_size = 1;
ft::mpi::bcast(&shape_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
for (int s_id = 0; s_id < shape_size; s_id++) {
int64_t val;
ft::mpi::bcast(&val, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
batchn_shape.push_back(val);
buffer_size *= val;
}
int64_t data_type_size = 1;
ft::mpi::bcast(&data_type_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
buffer_size *= data_type_size;
bcast_buffers[input_index] = (char*)realloc(bcast_buffers[input_index], buffer_size);
char* input_buffer = bcast_buffers[input_index];
ft::mpi::bcast(input_buffer, buffer_size, ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD);
int64_t name_size = 0;
ft::mpi::bcast(&name_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
char char_name[1024] = {0};
ft::mpi::bcast(char_name, name_size, ft::mpi::MPI_TYPE_CHAR, 0, ft::mpi::COMM_WORLD);
uint32_t data_type_num = 0;
ft::mpi::bcast(&data_type_num, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD);
TRITONSERVER_DataType triton_data_type = TRITONSERVER_DataType(data_type_num);
(*input_tensors)
->insert({std::string(char_name),
Tensor{TRITONSERVER_MEMORY_CPU, triton_data_type, batchn_shape, input_buffer}});
}
}
else {
int input_index = 0;
for (auto it = (*input_tensors)->begin(); it != (*input_tensors)->end(); ++it) {
std::vector<size_t> batchn_shape = it->second.shape;
int64_t shape_size = batchn_shape.size();
int64_t buffer_size = 1;
ft::mpi::bcast(&shape_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
for (int s_id = 0; s_id < shape_size; s_id++) {
int64_t val = batchn_shape[s_id];
ft::mpi::bcast(&val, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
buffer_size *= val;
}
ft::Tensor tmp{
ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, nullptr}; // TODO change the getDataTypeByteNum function to static
int64_t data_type_size = tmp.getTypeSize(triton::Tensor::convertTritonTypeToFt(it->second.type));
ft::mpi::bcast(&data_type_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
buffer_size *= data_type_size;
ft::mpi::bcast(
const_cast<void*>(it->second.data), buffer_size, ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD);
std::string name = it->first;
int64_t name_size = name.size();
ft::mpi::bcast(&name_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
bcast_buffers[input_index] = (char*)realloc(bcast_buffers[input_index], name_size);
char* char_name = bcast_buffers[input_index];
int64_t length = (int64_t)name.copy(char_name, name_size);
ft::FT_CHECK(length == name_size);
ft::mpi::bcast(char_name, name_size, ft::mpi::MPI_TYPE_CHAR, 0, ft::mpi::COMM_WORLD);
uint32_t data_type_num = (uint32_t)(it->second.type);
ft::mpi::bcast(&data_type_num, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD);
input_index++;
}
}
}
std::shared_ptr<std::unordered_map<std::string, Tensor>>
ModelInstanceState::Execute(std::vector<TRITONBACKEND_Response*>* responses,
stream_callback_ctx_t* context,
const uint32_t response_count,
std::shared_ptr<std::unordered_map<std::string, Tensor>> input_tensors)
{
int node_id = ft::mpi::getCommWorldRank();
if (node_id == 0) {
// Debug: input array
// triton_check_inputs(input_tensors, "triton_in");
}
if (node_id) {
input_tensors = std::make_shared<std::unordered_map<std::string, Tensor>>();
}
ft::mpi::barrier();
BroadcastInputTensors(&input_tensors);
std::vector<std::thread> threads;
std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors_list[model_instance_gpu_size_];
std::exception_ptr exception_ptr[model_instance_gpu_size_];
for (int gid = model_instance_device_id_start_; gid < model_instance_device_id_start_ + model_instance_gpu_size_;
gid++) {
int instance_local_id = gid - model_instance_device_id_start_;
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("before ThreadForward " + std::to_string(gid))).c_str());
threads.push_back(std::thread(ThreadForward,
&ft_model_instance_[instance_local_id],
&input_tensors,
&output_tensors_list[instance_local_id],
instance_comm_.get(),
&exception_ptr[instance_local_id],
gid,
is_decoupled_ && gid == model_instance_device_id_start_,
context));
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("after ThreadForward " + std::to_string(gid))).c_str());
}
for (auto& t : threads) {
t.join();
}
try {
for (int gid = model_instance_device_id_start_;
gid < model_instance_device_id_start_ + model_instance_gpu_size_;
gid++) {
int instance_local_id = gid - model_instance_device_id_start_;
if (exception_ptr[instance_local_id]) {
std::rethrow_exception(exception_ptr[instance_local_id]);
}
}
}
catch (std::exception& ex) {
SendErrorForResponses(
responses,
response_count,
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
("FasterTransformer execute failure: " + std::string(ex.what())).c_str()));
}
auto output_tensors = output_tensors_list[0];
return output_tensors;
}
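// Collects the input tensors of all requests into contiguous host buffers and
// records them in `input_tensors` for the FT model. When dynamic or sequence
// batching is enabled and a BATCH_ITEM_SHAPE batch input reports ragged
// sequence lengths, the affected input (e.g. input_ids) is first copied into a
// zero-padded [total_batch_size, max_elements_per_seq] buffer.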
void ModelInstanceState::SetInputTensors(
size_t total_batch_size,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses,
BackendInputCollector* collector,
std::vector<const char*>* input_names,
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>* input_tensors,
std::vector<BackendMemory*>* input_memories,
bool* cuda_copy)
{
const int max_batch_size = model_state_->MaxBatchSize();
// bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled();
bool dynamic_batching_enabled = model_state_->DynamicBatchingEnabled() || model_state_->SequenceBatchingEnabled();
// All requests must have equally-sized input tensors so use any
// request as the representative for the input tensors.
uint32_t input_count;
RESPOND_ALL_AND_RETURN_IF_ERROR(
responses, request_count, TRITONBACKEND_RequestInputCount(requests[0], &input_count));
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("get input count = ") + std::to_string(input_count)).c_str());
// Process batch input if any
RaggedBatchingParam_Map batch_input_param_map;
if (dynamic_batching_enabled) {
// Handle batch inputs for ragged batching
for (const auto& batch_input : model_state_->BatchInputs()) {
std::vector<int64_t> shape;
collector->BatchInputShape(batch_input, &shape);
auto batch_input_kind = batch_input.BatchInputKind();
auto batch_input_name = batch_input.TargetNames()[0];
// We only handle the ragged input_ids here.
// Assume the first dimension (sequence length) differs across requests while
// the remaining dimensions are identical. BATCH_ITEM_SHAPE has shape
// [num_requests (batches), num_dims (excluding the batch dimension)].
if (batch_input_kind == BatchInput::Kind::BATCH_ITEM_SHAPE
&& (batch_input_name == "input_ids_item_shape"
|| batch_input_name == "request_prompt_embedding_item_shape")) {
RaggedBatchingParams param{};
size_t num_feature_dimensions = (size_t)shape[1];
const char* dst_buffer = nullptr;
size_t dst_buffer_byte_size;
TRITONSERVER_MemoryType dst_memory_type;
int64_t dst_memory_type_id;
// Batch inputs are always created on CPU
RESPOND_ALL_AND_SET_NULL_IF_ERROR((*responses),
responses->size(),
collector->ProcessBatchInput(batch_input,
nullptr,
0,
{{TRITONSERVER_MEMORY_CPU, 0}},
&dst_buffer,
&dst_buffer_byte_size,
&dst_memory_type,
&dst_memory_type_id));
param.batch_input_ptr = reinterpret_cast<const int32_t*>(dst_buffer);
// concat all feature dimensions
param.batch_intput_size = (dst_buffer_byte_size / sizeof(int32_t)) / num_feature_dimensions;
if (num_feature_dimensions > 1) {
BackendMemory* batch_item_shape_memory;
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
dst_buffer_byte_size / num_feature_dimensions,
&batch_item_shape_memory));
int32_t* batch_item_shape_memory_ptr =
reinterpret_cast<int32_t*>(batch_item_shape_memory->MemoryPtr());
for (size_t idx = 0; idx < param.batch_intput_size; idx++) {
int32_t concat_dimensions = 1;
for (size_t dim_idx = 0; dim_idx < num_feature_dimensions; dim_idx++) {
concat_dimensions *= param.batch_input_ptr[idx * num_feature_dimensions + dim_idx];
// dim0 is seq length dimension
if (dim_idx == 0) {
param.max_seq_length =
std::max(param.max_seq_length, param.batch_input_ptr[idx * num_feature_dimensions]);
}
}
batch_item_shape_memory_ptr[idx] = concat_dimensions;
}
param.batch_input_ptr = reinterpret_cast<const int32_t*>(batch_item_shape_memory_ptr);
}
else {
param.max_seq_length =
*std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size);
}
// check if padding is needed
param.is_input_ragged = std::any_of(param.batch_input_ptr,
param.batch_input_ptr + param.batch_intput_size,
[&](int x) { return x != param.batch_input_ptr[0]; });
// calculate statistics of the elements
if (param.is_input_ragged) {
param.max_elements_per_seq =
*std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size);
param.total_input_elements =
std::accumulate(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size, 0);
batch_input_param_map.insert({batch_input_name, param});
// verbose logging for debugging
if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
std::string value_str = "[ ";
for (size_t i = 0; i < param.batch_intput_size; i++) {
value_str += std::to_string(param.batch_input_ptr[i]) + " ";
}
value_str += "]";
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("collect batch input name: ") + batch_input_name + "\n size: "
+ std::to_string(dst_buffer_byte_size) + " bytes\n value: " + value_str
+ "\n max sequence length: " + std::to_string(param.max_seq_length)
+ "\n max elements per sequence: " + std::to_string(param.max_elements_per_seq))
.c_str());
}
}
}
}
}
// Process user-defined inputs
for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) {
TRITONBACKEND_Input* input;
RESPOND_ALL_AND_RETURN_IF_ERROR(
responses, request_count, TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input));
const char* input_name;
TRITONSERVER_DataType input_datatype;
const int64_t* input_shape;
uint32_t input_dims_count;
RESPOND_ALL_AND_RETURN_IF_ERROR(
responses,
request_count,
TRITONBACKEND_InputProperties(
input, &input_name, &input_datatype, &input_shape, &input_dims_count, nullptr, nullptr));
input_names->emplace_back(input_name);
std::string input_name_str = std::string(input_name);
// Pad input ids from different requests
if (batch_input_param_map.find(input_name_str + "_item_shape") != batch_input_param_map.end()
&& batch_input_param_map[input_name_str + "_item_shape"].is_input_ragged) {
RaggedBatchingParams param = batch_input_param_map[input_name_str + "_item_shape"];
const int64_t total_batch_size_int64 = (int64_t)total_batch_size;
const int64_t max_elements_per_seq_int64 = (int64_t)param.max_elements_per_seq;
const size_t padded_input_ids_buffer_size =
GetByteSize(input_datatype, std::vector<int64_t>{total_batch_size_int64, max_elements_per_seq_int64});
// Always host memory
BackendMemory* padded_input_memory;
BackendMemory* request_input_memory;
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
padded_input_ids_buffer_size,
&padded_input_memory));
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
padded_input_ids_buffer_size,
&request_input_memory));
memset(padded_input_memory->MemoryPtr(), 0, padded_input_ids_buffer_size);
collector->ProcessTensor(
input_name,
request_input_memory->MemoryPtr(),
GetByteSize(input_datatype, std::vector<int64_t>{(int64_t)param.total_input_elements}),
request_input_memory->MemoryType(),
request_input_memory->MemoryTypeId());
int64_t accumulated_elements_offset = 0;
char* padded_input_ids_ptr = padded_input_memory->MemoryPtr();
char* base_input_ids = request_input_memory->MemoryPtr();
// copy each request buffer to padded buffer
for (int64_t single_batch_idx = 0; single_batch_idx < total_batch_size_int64; single_batch_idx++) {
int32_t sequence_elements = param.batch_input_ptr[single_batch_idx];
std::memcpy(padded_input_ids_ptr
+ GetByteSize(input_datatype,
std::vector<int64_t>{single_batch_idx, max_elements_per_seq_int64}),
base_input_ids
+ GetByteSize(input_datatype, std::vector<int64_t>{accumulated_elements_offset}),
GetByteSize(input_datatype, std::vector<int64_t>{sequence_elements}));
accumulated_elements_offset += sequence_elements;
}
// modify batch dimension shape, and sequence length dimension shape after
// padding
std::vector<size_t> batchn_shape(input_shape, input_shape + input_dims_count);
if (max_batch_size != 0) {
batchn_shape[0] = total_batch_size;
batchn_shape[1] = (size_t)param.max_seq_length;
// assume all non-seq-length dimensions have the same shape
if (input_dims_count > 2) {
batchn_shape[2] = (size_t)(param.max_elements_per_seq / param.max_seq_length);
}
}
(*input_tensors)
->insert({std::string(input_name),
triton::Tensor{TRITONSERVER_MEMORY_CPU, input_datatype, batchn_shape, padded_input_ids_ptr}});
continue;
}
// bool start_end_ready_flag = (input_name_str == "START" || input_name_str
// == "END"
// || input_name_str == "READY");
// int shape_dims_start = (int) (sequence_batching_enabled &&
// !start_end_ready_flag);
// The shape for the entire input batch, [total_batch_size, ...]
std::vector<int64_t> batchn_shape(input_shape, input_shape + input_dims_count);
if (max_batch_size != 0) {
batchn_shape[0] = total_batch_size;
}
std::vector<size_t> batchn_shape_2(input_shape, input_shape + input_dims_count);
if (max_batch_size != 0) {
batchn_shape_2[0] = total_batch_size;
}
// std::vector<int64_t> batchn_shape(
// input_shape + shape_dims_start, input_shape + input_dims_count);
// if (max_batch_size != 0 && !start_end_ready_flag) {
// batchn_shape[0] = total_batch_size;
// }
// std::vector<size_t> batchn_shape_2(
// input_shape + shape_dims_start, input_shape + input_dims_count);
// if (max_batch_size != 0 && !start_end_ready_flag) {
// batchn_shape_2[0] = total_batch_size;
// }
// The input must be in contiguous CPU/GPU memory.
const int64_t batchn_byte_size = GetByteSize(input_datatype, batchn_shape);
// Always host memory
BackendMemory* input_memory;
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
batchn_byte_size,
&input_memory));
input_memories->push_back(input_memory);
TRITONSERVER_MemoryType memory_type = input_memory->MemoryType();
int64_t memory_type_id = input_memory->MemoryTypeId();
char* input_buffer = input_memory->MemoryPtr();
collector->ProcessTensor(input_name, input_buffer, batchn_byte_size, memory_type, memory_type_id);
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("collect name: ") + input_name + " size: " + std::to_string(batchn_byte_size) + " bytes")
.c_str());
(*input_tensors)
->insert({std::string(input_name),
triton::Tensor{TRITONSERVER_MEMORY_CPU, input_datatype, batchn_shape_2, input_buffer}});
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("the data is in ") + (*cuda_copy ? std::string("GPU") : std::string("CPU"))).c_str());
// Finalize...
*cuda_copy |= collector->Finalize();
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("the data is in ") + (*cuda_copy ? std::string("GPU") : std::string("CPU"))).c_str());
}
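// Writes the FT output tensors back into the Triton responses through a
// BackendOutputResponder. The output buffers live on the first GPU of this
// instance, so the CUDA output stream is synchronized if any copy was
// scheduled on it.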
void ModelInstanceState::ReadOutputTensors(size_t total_batch_size,
std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses)
{
BackendOutputResponder responder(requests,
request_count,
responses,
model_state_->MaxBatchSize(),
model_state_->TritonMemoryManager(),
model_state_->EnablePinnedInput(),
output_stream_);
bool cuda_copy = false;
// bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled();
std::vector<std::vector<char>> string_buffers;
int idx = 0;
for (auto it = output_tensors->begin(); it != output_tensors->end(); ++it) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Get output_tensors ") + std::to_string(idx) + std::string(": ") + std::string(it->first))
.c_str());
idx++;
auto& output = it->second;
// Verify output datatype matches datatype from model config
TRITONSERVER_DataType output_dtype = output.type;
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string(" output_type: ") + TRITONSERVER_DataTypeString(output_dtype)).c_str());
const char* output_buffer = static_cast<const char*>(output.data);
// Set output shape
// std::vector<int64_t> batchn_shape = sequence_batching_enabled ?
// std::vector<int64_t>{1} :
// std::vector<int64_t>{};
std::vector<int64_t> batchn_shape;
if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
// std::string batch_shape_str = sequence_batching_enabled ? " output
// shape: [1, " :
// " output shape: [";
std::string batch_shape_str = " output shape: [";
for (uint i = 0; i < output.shape.size(); i++) {
batchn_shape.push_back(output.shape[i]);
batch_shape_str = batch_shape_str + std::to_string(output.shape[i]);
if (i != output.shape.size() - 1) {
batch_shape_str = batch_shape_str + ", ";
}
else {
batch_shape_str = batch_shape_str + "]";
}
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, batch_shape_str.c_str());
}
else {
batchn_shape.insert(batchn_shape.end(), output.shape.begin(), output.shape.end());
}
responder.ProcessTensor(it->first,
output_dtype,
batchn_shape,
output_buffer,
TRITONSERVER_MEMORY_GPU,
model_instance_device_id_start_);
}
// Finalize and wait for any pending buffer copies.
cuda_copy |= responder.Finalize();
#ifdef TRITON_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(output_stream_);
}
#endif // TRITON_ENABLE_GPU
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("PERFORMED GPU copy: ") + (cuda_copy ? std::string("YES") : std::string("NO"))).c_str());
}
/////////////
extern "C" {
TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
int provided;
ft::mpi::initThread(nullptr, nullptr, ft::mpi::THREAD_MULTIPLE, &provided);
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
std::string name(cname);
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_Initialize: ") + name).c_str());
// Check the backend API version that Triton supports vs. what this
// backend was compiled against.
uint32_t api_version_major, api_version_minor;
RETURN_IF_ERROR(TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "."
+ std::to_string(api_version_minor))
.c_str());
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("'") + name
+ "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "."
+ std::to_string(TRITONBACKEND_API_VERSION_MINOR))
.c_str());
if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR)
|| (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
(std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "."
+ std::to_string(api_version_minor) + " does not support '" + name + "' TRITONBACKEND API version: "
+ std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR))
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname));
std::string name(cname);
uint64_t version;
RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + std::to_string(version) + ")").c_str());
// Create a ModelState object and associate it with the
// TRITONBACKEND_Model.
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state");
delete model_state;
LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: MPI Finalize");
ft::mpi::finalize();
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
int node_id = ft::mpi::getCommWorldRank();
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname));
std::string name(cname);
// Get the model state associated with this instance's model.
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
// Create a ModelInstanceState object and associate it with the
// TRITONBACKEND_ModelInstance.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast<void*>(instance_state)));
int model_instance_id = instance_state->GetModelInstanceId();
int model_instance_count = instance_state->GetModelInstanceCount();
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (count "
+ std::to_string(model_instance_count) + ")" + " (instance_id " + std::to_string(model_instance_id)
+ ")")
.c_str());
if (node_id) {
while (true) {
instance_state->Execute(
nullptr, nullptr, 0, std::shared_ptr<std::unordered_map<std::string, Tensor>>(nullptr));
}
}
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state = reinterpret_cast<ModelInstanceState*>(vstate);
LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state");
delete instance_state;
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(TRITONBACKEND_ModelInstance* instance,
TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
// Triton will not call this function simultaneously for the same
// 'instance'. But since this backend could be used by multiple
// instances from multiple models the implementation needs to handle
// multiple calls to this function at the same time (with different
// 'instance' objects). Suggested practice for this is to use only
// function-local and model-instance-specific state (obtained from
// 'instance'), which is what we do here.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();
// This backend specifies BLOCKING execution policy. That means that
// we should not return from this function until execution is
// complete. Triton will automatically release 'instance' on return
// from this function so that it is again available to be used for
// another call to TRITONBACKEND_ModelInstanceExecute.
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("model ") + model_state->Name() + ", instance " + instance_state->Name() + ", executing "
+ std::to_string(request_count) + " requests")
.c_str());
// At this point we accept ownership of 'requests', which means that
// even if something goes wrong we must still return success from
// this function. If something does go wrong in processing a
// particular request then we send an error response just for the
// specific request.
instance_state->ProcessRequests(requests, request_count);
return nullptr; // success
}
} // extern "C"
} // namespace fastertransformer_backend
} // namespace backend
} // namespace triton
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
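# Linker version script for the backend shared library: only the
# TRITONBACKEND_* entry points are exported; all other symbols stay local.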
{
global:
TRITONBACKEND_*;
local: *;
};
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/CMakeLists.txt
cmake_minimum_required(VERSION 3.8)
set(llama_triton_backend_files
LlamaTritonModel.cc
LlamaTritonModelInstance.cc
)
add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files})
set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend Llama tensor memory_utils -lcublasLt)
target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14)
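# (Illustrative, not part of this build) a consumer target would link against
# the static library roughly like this:
#   add_executable(llama_triton_example example.cc)
#   target_link_libraries(llama_triton_example PRIVATE LlamaTritonBackend)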
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "3rdparty/INIReader.h"
#include "src/fastertransformer/models/llama/LlamaInstanceComm.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/allocator.h"
#include <mutex>
namespace ft = fastertransformer;
std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaModel(std::string inifile)
{
INIReader reader = INIReader(inifile);
if (reader.ParseError() < 0) {
std::cout << "[ERROR] Can't load '" << inifile << "'\n";
return nullptr;
}
const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type");
int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir");
if (data_type == "half" || data_type == "fp16") {
return std::make_shared<LlamaTritonModel<half>>(
reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0),
model_dir);
}
else {
return std::make_shared<LlamaTritonModel<float>>(
reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0),
model_dir);
}
}
template<typename T>
void LlamaTritonModel<T>::handleMissingParams()
{
if (!max_batch_size_) {
max_batch_size_ = 32;
FT_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
}
if (!session_len_) {
session_len_ = 2160;
FT_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
}
if (!max_context_token_num_) {
max_context_token_num_ = (int)std::sqrt(max_batch_size_);
FT_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
(int)max_context_token_num_);
}
if (!step_length_) {
step_length_ = 1;
FT_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
}
if (!cache_max_entry_count_) {
cache_max_entry_count_ = 32;
FT_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
(int)cache_max_entry_count_);
}
if (!cache_chunk_size_) {
cache_chunk_size_ = cache_max_entry_count_;
FT_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
}
}
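// The constructor below reads `<model_dir>/config.ini`, while createLlamaModel()
// above reads the `ft_instance_hyperparameter` section of the ini file passed to
// it. A minimal sketch of such a config, with key names taken from the code and
// values that are purely illustrative:
//
//   [ft_instance_hyperparameter]
//   data_type                = fp16
//   tensor_para_size         = 1
//   pipeline_para_size       = 1
//   enable_custom_all_reduce = 0
//   model_dir                = /workspace/models/llama
//
//   [llama]
//   model_name       = llama
//   head_num         = 32
//   size_per_head    = 128
//   inter_size       = 11008
//   num_layer        = 32
//   vocab_size       = 32000
//   rotary_embedding = 128
//   norm_eps         = 1e-6
//   start_id         = 1
//   end_id           = 2
//   weight_type      = fp16
//   # optional; handleMissingParams() supplies defaults for unset values
//   max_batch_size        = 32
//   session_len           = 2048
//   max_context_token_num = 0
//   step_length           = 1
//   cache_max_entry_count = 32
//   cache_chunk_size      = 0
//   use_context_fmha      = 1
//   prefix_cache_len      = 0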
template<typename T>
LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir):
tensor_para_size_(tensor_para_size),
pipeline_para_size_(pipeline_para_size),
shared_weights_(std::vector<std::shared_ptr<ft::LlamaWeight<T>>>(ft::getDeviceCount())),
enable_custom_all_reduce_(enable_custom_all_reduce)
{
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
INIReader reader = INIReader(inifile);
if (reader.ParseError() < 0) {
std::cout << "[ERROR] Can't load '" << inifile << "'\n";
ft::FT_CHECK(false);
}
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
size_per_head_ = reader.GetInteger("llama", "size_per_head");
inter_size_ = reader.GetInteger("llama", "inter_size");
num_layer_ = reader.GetInteger("llama", "num_layer");
vocab_size_ = reader.GetInteger("llama", "vocab_size");
rotary_embedding_dim_ = reader.GetInteger("llama", "rotary_embedding");
norm_eps_ = reader.GetFloat("llama", "norm_eps");
start_id_ = reader.GetInteger("llama", "start_id");
end_id_ = reader.GetInteger("llama", "end_id");
max_batch_size_ = reader.GetInteger("llama", "max_batch_size", 0);
max_context_token_num_ = reader.GetInteger("llama", "max_context_token_num", 0);
session_len_ = reader.GetInteger("llama", "session_len", 0);
step_length_ = reader.GetInteger("llama", "step_length", 0);
cache_max_entry_count_ = reader.GetInteger("llama", "cache_max_entry_count", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
cache_chunk_size_ = reader.GetInteger("llama", "cache_chunk_size", 0);
prefix_cache_len_ = reader.GetInteger("llama", "prefix_cache_len", 0);
handleMissingParams();
if (max_context_token_num_ <= max_batch_size_) {
max_context_token_num_ *= session_len_;
}
shared_state_ = std::make_shared<typename ft::LlamaV2<T>::SharedState>();
shared_state_->barrier = std::make_shared<ft::Barrier>(tensor_para_size);
const auto device_count = ft::getDeviceCount();
shared_instances_.resize(device_count);
shared_mutexes_.resize(device_count);
const std::string weight_type_str = reader.Get("llama", "weight_type");
if (weight_type_str == "fp16") {
weight_type_ = ft::WeightType::kFP16;
}
else if (weight_type_str == "fp32") {
weight_type_ = ft::WeightType::kFP32;
}
else if (weight_type_str == "int8") {
weight_type_ = ft::WeightType::kINT8;
}
else if (weight_type_str == "int4") {
weight_type_ = ft::WeightType::kINT4;
}
else {
std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n";
ft::FT_CHECK(0);
}
}
template<typename T>
std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSharedModelInstance(
int device_id,
int rank,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm)
{
ft::check_cuda_error(cudaSetDevice(device_id));
const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_);
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator(
new ft::Allocator<ft::AllocatorType::CUDA>(device_id));
/// TODO: this stream handle is leaked
cudaStream_t stream{};
ft::check_cuda_error(cudaStreamCreate(&stream));
allocator->setStream(stream);
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
cublasCreate(&cublas_handle);
cublasLtCreate(&cublaslt_handle);
cublasSetStream(cublas_handle, stream);
std::unique_ptr<ft::cublasAlgoMap> cublas_algo_map(new ft::cublasAlgoMap("gemm_config.in"));
std::unique_ptr<std::mutex> cublas_wrapper_mutex(new std::mutex());
std::unique_ptr<ft::cublasMMWrapper> cublas_wrapper(new ft::cublasMMWrapper(
cublas_handle, cublaslt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get()));
std::unique_ptr<cudaDeviceProp> cuda_device_prop_ptr(new cudaDeviceProp);
ft::check_cuda_error(cudaGetDeviceProperties(cuda_device_prop_ptr.get(), device_id));
if (std::is_same<T, half>::value) {
cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F);
}
else if (std::is_same<T, float>::value) {
cublas_wrapper->setFP32GemmConfig();
}
ft::NcclParam tensor_para = nccl_params.first[comms_rank];
ft::NcclParam pipeline_para = nccl_params.second[comms_rank];
ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_);
ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_);
auto llama = std::make_unique<ft::LlamaV2<T>>(head_num_,
size_per_head_,
inter_size_,
num_layer_,
vocab_size_,
rotary_embedding_dim_,
norm_eps_,
max_batch_size_,
max_context_token_num_,
session_len_,
step_length_,
start_id_,
end_id_,
cache_max_entry_count_,
cache_chunk_size_,
use_context_fmha_,
shared_state_,
shared_weights_[device_id].get(),
tensor_para,
stream,
cublas_wrapper.get(),
allocator.get(),
false, // is_free_buffer_after_forward,
cuda_device_prop_ptr.get());
return std::make_unique<LlamaTritonSharedModelInstance<T>>(
LlamaTritonSharedModelInstance<T>{std::move(llama),
shared_weights_[device_id],
std::move(allocator),
std::move(cublas_algo_map),
std::move(cublas_wrapper_mutex),
std::move(cublas_wrapper),
std::move(cuda_device_prop_ptr),
session_len_});
}
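// Creates a Triton-facing model instance bound to `device_id`. The heavyweight
// engine state (LlamaV2, weights, cuBLAS handles) is shared per device: it is
// cached in shared_instances_ as a weak_ptr and rebuilt by
// createSharedModelInstance() only after every previous strong reference has
// been released.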
template<typename T>
std::unique_ptr<AbstractTransformerModelInstance>
LlamaTritonModel<T>::createModelInstance(int device_id,
int rank,
cudaStream_t stream,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm)
{
ft::check_cuda_error(cudaSetDevice(device_id));
// const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_);
std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance;
{
std::lock_guard<std::mutex> lock(shared_mutexes_[device_id]);
instance = shared_instances_[device_id].lock();
if (!instance) {
instance = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm);
shared_instances_[device_id] = instance;
}
}
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator(
new ft::Allocator<ft::AllocatorType::CUDA>(device_id));
allocator->setStream(stream);
return std::make_unique<LlamaTritonModelInstance<T>>(instance, std::move(allocator));
}
template<typename T>
void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
{
ft::check_cuda_error(cudaSetDevice(device_id));
const int tensor_para_rank = rank % tensor_para_size_;
const int pipeline_para_rank = rank / tensor_para_size_;
ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0);
shared_weights_[device_id] = std::make_shared<ft::LlamaWeight<T>>(head_num_ * size_per_head_,
inter_size_,
vocab_size_,
num_layer_,
weight_type_,
tensor_para_size_,
tensor_para_rank,
prefix_cache_len_);
shared_weights_[device_id]->loadModel(model_dir_);
return;
}
template<typename T>
std::string LlamaTritonModel<T>::toString()
{
std::stringstream ss;
ss << "Model: "
<< "\nhead_num: " << head_num_ << "\nsize_per_head: " << size_per_head_ << "\ninter_size: " << inter_size_
<< "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nmax_batch_size: " << max_batch_size_
<< "\nmax_context_token_num: " << max_context_token_num_ << "\nsession_len: " << session_len_
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_entry_count_
<< "\ncache_chunk_size: " << cache_chunk_size_ << "\nuse_context_fmha: " << use_context_fmha_
<< "\nstart_id: " << start_id_ << "\ntensor_para_size: " << tensor_para_size_
<< "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_
<< "\nmodel_name: " << model_name_ << "\nprefix_cache_len: " << prefix_cache_len_
<< "\nmodel_dir: " << model_dir_ << std::endl;
return ss.str();
}
template<typename T>
void LlamaTritonModel<T>::createCustomComms(
std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms, int world_size)
{
using commDataType = typename ft::CustomARCommTypeConverter<T>::Type;
ft::initCustomAllReduceComm<commDataType>(custom_all_reduce_comms, enable_custom_all_reduce_, world_size);
}
template<typename T>
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
LlamaTritonModel<T>::createNcclParams(const int node_id, const int device_id_start, const bool multi_node)
{
const auto device_count = ft::getDeviceCount();
bool need_nccl_params = false;
// create nccl group when there are non-occupied devices
for (int i = 0; i < device_count; ++i) {
std::lock_guard<std::mutex> lock(shared_mutexes_[i]);
if (shared_instances_[i].expired()) {
need_nccl_params = true;
break;
}
}
if (need_nccl_params) {
return AbstractTransformerModel::createNcclParams(node_id, device_id_start, multi_node);
}
else {
FT_LOG_INFO("Skipping NCCL param creation.");
const int tensor_para_size = getTensorParaSize();
const int pipeline_para_size = getPipelineParaSize();
const int local_comm_size = multi_node ? device_count : tensor_para_size * pipeline_para_size;
std::vector<ft::NcclParam> tensor_para_params(local_comm_size);
std::vector<ft::NcclParam> pipeline_para_params(local_comm_size);
return {std::move(tensor_para_params), std::move(pipeline_para_params)};
}
}
template<typename T>
std::unique_ptr<ft::AbstractInstanceComm> LlamaTritonModel<T>::createInstanceComm(int size)
{
return std::make_unique<ft::LlamaInstanceComm>(size);
}
template<typename T>
int LlamaTritonModel<T>::getTensorParaSize()
{
return tensor_para_size_;
}
template<typename T>
int LlamaTritonModel<T>::getPipelineParaSize()
{
return pipeline_para_size_;
}
template struct LlamaTritonModel<float>;
template struct LlamaTritonModel<half>;
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include <cuda_fp16.h>
#include <mutex>
namespace ft = fastertransformer;
template<typename T>
struct LlamaTritonSharedModelInstance;
template<typename T>
struct LlamaTritonModel: public AbstractTransformerModel {
LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir);
~LlamaTritonModel() = default;
std::unique_ptr<AbstractTransformerModelInstance>
createModelInstance(int deviceId,
int rank,
cudaStream_t stream,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;
void createSharedWeights(int deviceId, int rank) override;
void createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
int world_size) override;
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
createNcclParams(const int node_id, const int device_id_start, const bool multi_node) override;
std::unique_ptr<ft::AbstractInstanceComm> createInstanceComm(int size) override;
void handleMissingParams();
std::string toString() override;
int getTensorParaSize() override;
int getPipelineParaSize() override;
private:
std::unique_ptr<LlamaTritonSharedModelInstance<T>>
createSharedModelInstance(int deviceId,
int rank,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t vocab_size_;
size_t rotary_embedding_dim_;
float norm_eps_;
int max_batch_size_;
int max_context_token_num_;
int session_len_;
int step_length_;
int start_id_;
int end_id_;
int cache_max_entry_count_;
int cache_chunk_size_;
int use_context_fmha_;
size_t tensor_para_size_;
size_t pipeline_para_size_;
ft::WeightType weight_type_;
size_t prefix_cache_len_{};
// shared weights for each device
std::vector<std::shared_ptr<ft::LlamaWeight<T>>> shared_weights_;
std::shared_ptr<typename ft::LlamaV2<T>::SharedState> shared_state_;
// weak_ptr is used so that the instances get released when all strong references are gone
std::vector<std::weak_ptr<LlamaTritonSharedModelInstance<T>>> shared_instances_;
std::deque<std::mutex> shared_mutexes_; // is locking really needed?
// // residual type
// bool use_gptj_residual_ = true;
// // number of tasks (for prefix-prompt, p/prompt-tuning)
// size_t num_tasks_ = 0;
// int prompt_learning_start_id_ = 0;
// ft::PromptLearningType prompt_learning_type_ = ft::PromptLearningType::no_prompt;
// std::map<std::string, std::pair<int, int>> prompt_learning_table_pair_ = {};
bool is_fp16_;
int enable_custom_all_reduce_ = 0;
std::string model_name_;
std::string model_dir_;
};
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/triton_backend/triton_utils.hpp"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <unordered_map>
#include <vector>
namespace ft = fastertransformer;
template<typename T>
void triton_stream_callback(std::unordered_map<std::string, ft::Tensor>* output_tensors, void* ctx)
{
LlamaTritonModelInstance<T>* model = reinterpret_cast<LlamaTritonModelInstance<T>*>(ctx);
auto result = LlamaTritonModelInstance<T>::convert_outputs(*output_tensors);
model->stream_cb_(result, model->stream_ctx_);
}
template<typename T>
LlamaTritonModelInstance<T>::LlamaTritonModelInstance(
std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance,
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator):
instance_(std::move(instance)), allocator_(std::move(allocator))
{
}
template<typename T>
std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert_inputs(
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_);
const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
const size_t input_data_len = input_tensors->at("input_ids").shape[1];
// freed in forward()
h_total_output_lengths_ = reinterpret_cast<uint32_t*>(malloc(request_batch_size * sizeof(uint32_t)));
std::unordered_map<std::string, ft::Tensor> ft_input_tensors = std::unordered_map<std::string, ft::Tensor>{
{"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)},
// {"input_lengths", as_GPU_tensor(input_tensors->at("input_lengths"), d_input_lengths_)},
};
if (input_tensors->find("bad_words_list") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("bad_words_list"), d_input_bad_words_, &allocator_);
ft_input_tensors.insert(
{"bad_words_list", as_GPU_tensor(input_tensors->at("bad_words_list"), d_input_bad_words_)});
}
if (input_tensors->find("stop_words_list") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("stop_words_list"), d_input_stop_words_, &allocator_);
ft_input_tensors.insert(
{"stop_words_list", as_GPU_tensor(input_tensors->at("stop_words_list"), d_input_stop_words_)});
}
if (input_tensors->count("request_prompt_embedding") && input_tensors->count("request_prompt_lengths")
&& input_tensors->count("request_prompt_type")) {
move_tensor_H2D(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_, &allocator_);
ft_input_tensors.insert(
{"request_prompt_lengths",
as_GPU_tensor(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_)});
move_tensor_H2D(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_, &allocator_);
ft_input_tensors.insert(
{"request_prompt_embedding",
as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)});
}
if (input_tensors->find("top_p_decay") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_decay"), d_top_p_decay_, &allocator_);
ft_input_tensors.insert({"top_p_decay", as_GPU_tensor(input_tensors->at("top_p_decay"), d_top_p_decay_)});
}
if (input_tensors->find("top_p_min") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_min"), d_top_p_min_, &allocator_);
ft_input_tensors.insert({"top_p_min", as_GPU_tensor(input_tensors->at("top_p_min"), d_top_p_min_)});
}
if (input_tensors->find("top_p_reset_ids") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_, &allocator_);
ft_input_tensors.insert(
{"top_p_reset_ids", as_GPU_tensor(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_)});
}
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
if (t->first.find("input_ids") == std::string::npos // && t->first.find("input_lengths") == std::string::npos
&& t->first.find("output_seq_len") == std::string::npos
&& t->first.find("prefix_soft_prompt_embedding") == std::string::npos
&& t->first.find("prefix_soft_prompt_lengths") == std::string::npos) {
if (ft_input_tensors.count(t->first) == 0) {
ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()});
}
}
}
return ft_input_tensors;
}
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::convert_outputs(const std::unordered_map<std::string, ft::Tensor>& output_tensors)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
std::unordered_map<std::string, triton::Tensor>* outputs_mapping =
new std::unordered_map<std::string, triton::Tensor>();
for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) {
outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)});
}
return std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>(outputs_mapping);
}
template<typename T>
std::shared_ptr<std::vector<triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors)
{
ft::FT_CHECK(false);
return nullptr;
}
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
{
ft::FT_CHECK(false);
return nullptr;
}
template<typename T>
std::string format_vector(const std::vector<T>& vec)
{
std::stringstream ss;
ss << "[";
bool first = true;
for (const auto& x : vec) {
ss << (first ? "" : ", ") << x;
first = false;
}
ss << "]";
return ss.str();
}
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors,
ft::AbstractInstanceComm* instance_comm)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// for (const auto& kv : *input_tensors) {
// FT_LOG_INFO("%s: %s", kv.first.c_str(), format_vector(kv.second.shape).c_str());
// }
FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2,
"input_tensors->at(\"input_ids\").shape.size() == 2");
FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1,
"input_tensors->at(\"input_lengths\").shape.size() == 1");
const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0];
const uint32_t max_request_output_len = (size_t)*std::max_element(
(int*)input_tensors->at("request_output_len").data,
(int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]);
// const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1];
const uint32_t beam_width =
input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1;
FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented");
std::unordered_map<std::string, ft::Tensor> ft_input_tensors = convert_inputs(input_tensors);
allocateBuffer(request_batch_size, beam_width, instance_->session_len);
std::unordered_map<std::string, ft::Tensor> output_tensors = std::unordered_map<std::string, ft::Tensor>{
{"output_ids",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_UINT32,
std::vector<size_t>{request_batch_size, beam_width, (size_t)instance_->session_len},
d_output_ids_}},
{"sequence_length",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_UINT32,
std::vector<size_t>{request_batch_size, beam_width},
d_sequence_lengths_}}};
if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) {
output_tensors.insert({"output_log_probs",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_FP32,
std::vector<size_t>{request_batch_size, beam_width, max_request_output_len},
d_output_log_probs_}});
output_tensors.insert({"cum_log_probs",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_FP32,
std::vector<size_t>{request_batch_size, beam_width},
d_cum_log_probs_}});
}
try {
ft::Request::Callback callback;
if (stream_cb_) {
callback = [this](std::unordered_map<std::string, ft::Tensor>* outputs) {
triton_stream_callback<T>(outputs, this);
};
}
ft::check_cuda_error(cudaStreamSynchronize(allocator_->returnStream()));
instance_->llm->forward(&output_tensors, &ft_input_tensors, {instance_comm, callback});
// ! stream synced by the model before returning
}
catch (...) {
h_exception_ = std::current_exception();
output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}});
}
if (h_total_output_lengths_ != nullptr) {
free(h_total_output_lengths_);
h_total_output_lengths_ = nullptr;
}
return convert_outputs(output_tensors);
}
template<typename T>
LlamaTritonModelInstance<T>::~LlamaTritonModelInstance()
{
freeBuffer();
}
template<typename T>
void LlamaTritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
const size_t beam_width,
const size_t session_len)
{
d_output_ids_ =
(int*)(allocator_->reMalloc(d_output_ids_, sizeof(int) * request_batch_size * beam_width * session_len, false));
d_sequence_lengths_ =
(int*)(allocator_->reMalloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width, false));
d_output_log_probs_ = (float*)(allocator_->reMalloc(
d_output_log_probs_, sizeof(float) * request_batch_size * beam_width * session_len, false));
d_cum_log_probs_ =
(float*)(allocator_->reMalloc(d_cum_log_probs_, sizeof(float) * request_batch_size * beam_width, false));
}
template<typename T>
void LlamaTritonModelInstance<T>::freeBuffer()
{
allocator_->free((void**)(&d_output_ids_));
allocator_->free((void**)(&d_sequence_lengths_));
allocator_->free((void**)(&d_output_log_probs_));
allocator_->free((void**)(&d_cum_log_probs_));
}
template struct LlamaTritonModelInstance<float>;
template struct LlamaTritonModelInstance<half>;
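// --- Illustrative usage sketch (editorial, not part of the code base) --------
// A minimal single-node, single-GPU driver for the classes above. It assumes a
// model directory containing config.ini and converted weights, that
// AbstractTransformerModel declares createLlamaModel(), and that the base
// instance interface exposes the forward(input_tensors, instance_comm) overload
// used by the Triton backend's ThreadForward. Tensor construction and error
// handling are elided.
//
//   auto model = AbstractTransformerModel::createLlamaModel("/models/llama/config.ini");
//   auto nccl_params = model->createNcclParams(/*node_id=*/0, /*device_id_start=*/0, /*multi_node=*/false);
//   model->createSharedWeights(/*deviceId=*/0, /*rank=*/0);
//
//   cudaStream_t stream{};
//   ft::check_cuda_error(cudaStreamCreate(&stream));
//   auto instance      = model->createModelInstance(/*deviceId=*/0, /*rank=*/0, stream, nccl_params, nullptr);
//   auto instance_comm = model->createInstanceComm(/*size=*/1);
//
//   auto inputs = std::make_shared<std::unordered_map<std::string, triton::Tensor>>();
//   // ... insert "input_ids" [batch, seq_len], "input_lengths" [batch] and
//   //     "request_output_len" [batch] host tensors here ...
//   auto outputs = instance->forward(inputs, instance_comm.get());
// ------------------------------------------------------------------------------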