Unverified commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
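The diff below is a mechanical rename across the LLaMA sources: include paths move from src/fastertransformer/ to src/turbomind/, the enclosing namespace fastertransformer becomes turbomind, and the FT_LOG_* logging macros become TM_LOG_*, while FT_CHECK keeps its old prefix. A minimal before/after sketch of the pattern; the warmup() function and its log message are hypothetical, and the idea that FT_CHECK comes from cuda_utils.h is only inferred from the includes in the files below:

```cpp
// Before this commit (FasterTransformer naming):
//
//   #include "src/fastertransformer/utils/logger.h"
//
//   namespace fastertransformer {
//   void warmup() { FT_LOG_INFO("warmup done"); }
//   }  // namespace fastertransformer

// After this commit (TurboMind naming):
#include "src/turbomind/utils/cuda_utils.h"  // assumed home of FT_CHECK, which is not renamed here
#include "src/turbomind/utils/logger.h"      // TM_LOG_* macros

namespace turbomind {

void warmup()
{
    FT_CHECK(true);              // assertion macro keeps the FT_ prefix in this commit
    TM_LOG_INFO("warmup done");  // FT_LOG_INFO -> TM_LOG_INFO
}

}  // namespace turbomind
```

Throughout the diff, the renamed TM_LOG_INFO calls keep the existing rank_ == 0 guards, so only tensor-parallel rank 0 emits these messages.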
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/models/llama/LlamaCacheManager.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace fastertransformer {
namespace turbomind {
LlamaCacheManager::~LlamaCacheManager()
{
@@ -16,7 +16,7 @@ LlamaCacheManager::~LlamaCacheManager()
void* LlamaCacheManager::allocate(bool is_preallocte)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate]");
TM_LOG_INFO("[LlamaCacheManager][allocate]");
}
void* mem_ptr{};
@@ -26,7 +26,7 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
device_free_.pop();
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else if (entry_count_ < max_entry_count_) {
@@ -34,14 +34,14 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
const size_t entry_byte_size = 2 * cache_byte_size_; // 2 for k,v
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
TM_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
}
const auto chunk_ptr = allocator_->malloc(alloc_count * entry_byte_size, false);
FT_CHECK(chunk_ptr);
device_mem_.push_back(chunk_ptr);
entry_count_ += alloc_count;
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
TM_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
}
for (int i = 0; i < alloc_count; ++i) {
@@ -54,7 +54,7 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else {
@@ -68,13 +68,13 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
auto LlamaCacheManager::create(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
TM_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
}
for (const auto& e : device_cache_) {
if (e.id == id) {
if (rank_ == 0) {
FT_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
TM_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
}
erase(id);
}
@@ -102,7 +102,7 @@ auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::i
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
if (it == device_cache_.end()) {
FT_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
TM_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
FT_CHECK(0);
}
return it;
@@ -111,7 +111,7 @@ auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::i
auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
TM_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
@@ -131,7 +131,7 @@ auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
TM_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
}
auto entry = getEntryOrThrow(seq.id);
@@ -145,7 +145,7 @@ void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
void LlamaCacheManager::erase(uint64_t id)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
TM_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
@@ -153,7 +153,7 @@ void LlamaCacheManager::erase(uint64_t id)
if (entry->k_cache) {
device_free_.push(entry->k_cache);
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
TM_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
}
}
device_cache_.erase(entry);
@@ -171,7 +171,7 @@ void* LlamaCacheManager::evict()
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
TM_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
}
FT_CHECK(it->k_cache);
@@ -189,4 +189,4 @@ bool LlamaCacheManager::contains(uint64_t id) const noexcept
return it != device_cache_.end();
}
} // namespace fastertransformer
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/logger.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <queue>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
namespace turbomind {
// k-cache layout [L, H, D/x, S[s:], x]
// v-cache layout [L, H, S[s:], D/x, x]
@@ -36,8 +36,8 @@ public:
allocator_(allocator)
{
if (rank == 0) {
FT_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
FT_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
TM_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
TM_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
}
allocate(true);
}
@@ -99,4 +99,4 @@ private:
std::vector<Sequence> device_cache_;
};
} // namespace fastertransformer
} // namespace turbomind
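For orientation, the LlamaCacheManager interface touched above manages per-sequence KV-cache entries through allocate/create/fetch/update/erase/evict/contains. Below is a hedged usage sketch based only on the signatures visible in this diff; the run_request() wrapper and its control flow are illustrative assumptions, not code from the repository:

```cpp
#include "src/turbomind/models/llama/LlamaCacheManager.h"

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative only: drive one request through the sequence KV-cache manager.
void run_request(turbomind::LlamaCacheManager& cache, uint64_t id, cudaStream_t stream)
{
    // Resume the sequence if its entry is still cached, otherwise create it
    // (create() warns about and removes a conflicting entry with the same id).
    auto seq = cache.contains(id) ? cache.fetch(id, stream) : cache.create(id, stream);

    // ... run context / decoder passes that read and write seq's k/v blocks ...

    cache.update(seq, stream);  // persist the sequence's new state
    cache.erase(id);            // session finished: return the entry to the free list
}
```

Device memory for the entries is handled inside the manager itself via allocate() and evict(), as shown in the .cc diff above.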
@@ -17,18 +17,18 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
@@ -36,7 +36,7 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
size_t max_q_len,
size_t max_k_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
// no padding
qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * num_token * 3 * local_hidden_units_, true);
@@ -75,7 +75,7 @@ template<typename T>
void LlamaContextAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&q_buf_2_));
@@ -98,7 +98,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
@@ -403,4 +403,4 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -17,16 +17,16 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaContextAttentionLayer {
@@ -124,4 +124,4 @@ private:
bool is_allocate_buffer_ = false;
};
} // namespace fastertransformer
} // namespace turbomind
@@ -16,17 +16,17 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/Tensor.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer()
@@ -37,7 +37,7 @@ void LlamaContextDecoder<T>::allocateBuffer()
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
attn_ffn_io_ = (T*)allocator_->reMalloc(attn_ffn_io_, sizeof(T) * num_token * hidden_units_, false);
attention_mask_ = (T*)allocator_->reMalloc(attention_mask_, sizeof(T) * batch_size * max_q_len * max_kv_len, false);
@@ -50,7 +50,7 @@ void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token,
template<typename T>
void LlamaContextDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&attn_ffn_io_);
allocator_->free((void**)&padding_offset_);
@@ -94,7 +94,7 @@ void LlamaContextDecoder<T>::forwardSelfAttn(const Session&
int layer,
bool is_final)
{
// FT_LOG_ERROR(__PRETTY_FUNCTION__);
// TM_LOG_ERROR(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors{
{"input_query", Tensor{MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}},
{"attention_mask",
@@ -283,4 +283,4 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -16,25 +16,25 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
// #include "src/fastertransformer/kernels/add_residual_kernels.h"
// #include "src/fastertransformer/kernels/layernorm_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
// #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
// #include "src/turbomind/kernels/add_residual_kernels.h"
// #include "src/turbomind/kernels/layernorm_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
// #include "src/turbomind/layers/FfnLayer.h"
// #include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaContextDecoder: public BaseLayer {
@@ -112,4 +112,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
} // namespace turbomind
@@ -17,14 +17,14 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
@@ -50,14 +50,14 @@ LlamaDecoder<T>::LlamaDecoder(size_t head_num,
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(quant_policy);
}
template<typename T>
LlamaDecoder<T>::~LlamaDecoder()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete self_attention_layer_;
delete silu_ffn_layer_;
}
@@ -65,7 +65,7 @@ LlamaDecoder<T>::~LlamaDecoder()
template<typename T>
void LlamaDecoder<T>::initialize(int quant_policy)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
size_per_head_,
@@ -97,14 +97,14 @@ void LlamaDecoder<T>::allocateBuffer()
template<typename T>
void LlamaDecoder<T>::allocateBuffer(size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
is_allocate_buffer_ = false;
}
@@ -116,7 +116,7 @@ void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session&
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
{MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io});
@@ -157,7 +157,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param decoder_input [batch_size, hidden_dims]
@@ -242,4 +242,4 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
template class LlamaDecoder<half>;
template class LlamaDecoder<float>;
} // namespace fastertransformer
} // namespace turbomind
@@ -17,17 +17,17 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/layers/BaseLayer.h"
// #include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaDecoder: public BaseLayer {
@@ -93,4 +93,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
} // namespace turbomind
@@ -16,13 +16,13 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t hidden_units,
@@ -122,12 +122,12 @@ void LlamaDecoderLayerWeight<T>::mallocWeights()
deviceMalloc((T**)&self_attn_norm_weights, hidden_units_);
deviceMalloc((T**)&ffn_norm_weights, hidden_units_);
fastertransformer::mallocWeights(self_attn_weights.qkv, attn_bias_);
fastertransformer::mallocWeights(self_attn_weights.output, attn_bias_);
turbomind::mallocWeights(self_attn_weights.qkv, attn_bias_);
turbomind::mallocWeights(self_attn_weights.output, attn_bias_);
fastertransformer::mallocWeights(ffn_weights.gating, false);
fastertransformer::mallocWeights(ffn_weights.intermediate, false);
fastertransformer::mallocWeights(ffn_weights.output, false);
turbomind::mallocWeights(ffn_weights.gating, false);
turbomind::mallocWeights(ffn_weights.intermediate, false);
turbomind::mallocWeights(ffn_weights.output, false);
}
template<typename T>
@@ -175,4 +175,4 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
template struct LlamaDecoderLayerWeight<float>;
template struct LlamaDecoderLayerWeight<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -16,13 +16,13 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
struct LlamaDecoderLayerWeight {
@@ -58,4 +58,4 @@ private:
void mallocWeights();
};
} // namespace fastertransformer
} // namespace turbomind
@@ -16,18 +16,18 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include <string>
// #include <glog/logging.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
struct SATypeConverter {
@@ -157,7 +157,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::allocateBuffer(size_t batch_size, int key_len, int max_memory_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
qkv_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(qkv_buf_, sizeof(T) * batch_size * 3 * local_hidden_units_, false));
context_buf_ =
@@ -289,4 +289,4 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -16,16 +16,16 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaDecoderSelfAttentionLayer {
@@ -97,4 +97,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -15,15 +15,15 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/DenseWeight.h
#pragma once
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/layers/FfnWeight.h"
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
enum class WeightType : int {
kFP32,
@@ -75,4 +75,4 @@ struct LlamaFfnWeight {
LlamaDenseWeight<T> output;
};
} // namespace fastertransformer
} // namespace turbomind
@@ -15,15 +15,15 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/FfnLayer.h
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/utils/nvtx_utils.h"
// #include <glog/logging.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
void LlamaFfnLayer<T>::allocateBuffer(size_t token_num)
@@ -110,4 +110,4 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
template class LlamaFfnLayer<float>;
template class LlamaFfnLayer<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -15,18 +15,18 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/FfnLayer.cc
#pragma once
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
// #include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
#include <functional>
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaFfnLayer {
@@ -82,4 +82,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -2,10 +2,10 @@
#pragma once
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/utils/instance_comm.h"
namespace fastertransformer {
namespace turbomind {
class LlamaInstanceComm: public AbstractInstanceComm {
public:
@@ -31,4 +31,4 @@ private:
void* ptr{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -2,12 +2,12 @@
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaLinear {
@@ -58,4 +58,4 @@ private:
cudaStream_t stream_{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -2,14 +2,14 @@
#pragma once
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include <array>
#include <atomic>
#include <condition_variable>
#include <cuda_runtime.h>
#include <mutex>
namespace fastertransformer {
namespace turbomind {
struct NcclGuard {
static constexpr int kMaxGroupCount = 32;
@@ -89,4 +89,4 @@
std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
};
} // namespace fastertransformer
} // namespace turbomind
@@ -18,24 +18,24 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/kernels/decoding_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <functional>
#include <memory>
#include <sstream>
#include <stdexcept>
namespace fastertransformer {
namespace turbomind {
template<typename T>
LlamaV2<T>::LlamaV2(size_t head_num,
@@ -87,15 +87,15 @@ LlamaV2<T>::LlamaV2(size_t head_num,
shared_state_(shared_state)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(vocab_size_ % tensor_para_.world_size_ == 0);
FT_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
TM_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
size_t elem_bits = 0;
if (quant_policy & QuantPolicy::kCacheKVInt8) {
elem_bits = sizeof(int8_t) * 8;
if (use_context_fmha) {
FT_LOG_ERROR("use_context_fmha not support int8");
TM_LOG_ERROR("use_context_fmha not support int8");
assert(0);
}
}
@@ -128,7 +128,7 @@ LlamaV2<T>::~LlamaV2()
template<typename T>
void LlamaV2<T>::initialize(bool use_context_fmha, int quant_policy)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
context_decoder_ = new LlamaContextDecoder<T>(head_num_,
size_per_head_,
@@ -170,7 +170,7 @@ void LlamaV2<T>::initialize(bool use_context_fmha, int quant_policy)
template<typename T>
void LlamaV2<T>::embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
// ! This kernel can't be used in context decoding
invokeEmbeddingLookupPosEncodingPadCount(embeddings,
weights_->pre_decoder_embedding_table,
@@ -203,10 +203,10 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding start");
TM_LOG_INFO("context decoding start");
}
invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf,
@@ -250,7 +250,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
context_decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding end");
TM_LOG_INFO("context decoding end");
}
}
@@ -267,7 +267,7 @@ void LlamaV2<T>::decoderForward(T* decoder_output,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const int max_seq_len = session_len;
const auto dtype = getTensorType<T>();
@@ -298,7 +298,7 @@
template<typename T>
void LlamaV2<T>::postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t data_type = getCudaDataType<T>();
float alpha = 1.f;
float beta = 0.f;
@@ -375,7 +375,7 @@ void LlamaV2<T>::dynamicDecode(int* token_ids,
size_t token_ids_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
int local_batch_size = (int)batch_size;
std::unordered_map<std::string, Tensor> dynamic_decode_input_tensors{
@@ -421,7 +421,7 @@ void LlamaV2<T>::dynamicDecode(int* token_ids,
template<typename T>
void LlamaV2<T>::internalThreadEntry(int device_id)
{
FT_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
TM_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
check_cuda_error(cudaSetDevice(device_id));
auto& request_queue = shared_state_->request_queue;
@@ -514,10 +514,10 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
if (debug_) {
if (tensor_para_.rank_ == 0) {
for (const auto& kv : *inputs) {
FT_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
TM_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
for (const auto& kv : *outputs) {
FT_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
TM_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
}
}
@@ -566,10 +566,10 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
std::vector<int> error_codes;
bool has_error = 0;
if (rank == 0) {
FT_LOG_INFO("[forward] Enqueue requests");
TM_LOG_INFO("[forward] Enqueue requests");
auto futures = shared_state_->request_queue.enqueue(std::move(requests));
FT_LOG_INFO("[forward] Wait for requests to complete ...");
TM_LOG_INFO("[forward] Wait for requests to complete ...");
for (auto& f : futures) {
auto ec = f.get();
error_codes.push_back(ec);
@@ -594,4 +594,4 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
template class LlamaV2<half>;
template class LlamaV2<float>;
} // namespace fastertransformer
} // namespace turbomind
@@ -17,24 +17,24 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGpt.h
#pragma once
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/instance_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
#include <unordered_map>
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaV2 {
@@ -183,4 +183,4 @@ private:
std::thread internal_thread_;
};
} // namespace fastertransformer
} // namespace turbomind