build turbomind (#35)

* build turbomind
* change namespace fastertransformer to turbomind
* change logger name

build turbomind (#35)
* build turbomind
* change namespace fastertransformer to turbomind
* change logger name
35d64462 · lvhan028 · GitHub · 53d2e42c · 35d64462 · 35d64462
Unverified Commit 35d64462 authored Jul 01, 2023 by lvhan028 Committed by GitHub Jul 01, 2023
20 changed files
--- a/src/turbomind/triton_backend/triton_utils.hpp
+++ b/src/turbomind/triton_backend/triton_utils.hpp
@@ -16,10 +16,10 @@

 #pragma once

-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/utils/Tensor.h"

-namespace ft = fastertransformer;
+namespace ft = turbomind;

 template<typename T>
 void move_tensor_H2D(const triton::Tensor&                                          tensor,

--- a/src/turbomind/utils/IA3.h
+++ b/src/turbomind/utils/IA3.h
@@ -16,7 +16,7 @@

 #pragma once

-namespace fastertransformer {
+namespace turbomind {

 enum IA3_config {
    KEY_ADAPTER   = 1 << 0,
@@ -43,4 +43,4 @@ static inline IA3_config& operator|=(IA3_config& x, IA3_config y)
    return x = static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
 }

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/ScaleList.h
+++ b/src/turbomind/utils/ScaleList.h
@@ -17,7 +17,7 @@
 #pragma once
 #include "stdlib.h"

-namespace fastertransformer {
+namespace turbomind {

 #define ACTIVATION_AMAX_NUM 72
 #define INT8O_GEMM_NUM 8
@@ -48,4 +48,4 @@ struct ScaleList {
    size_t       p4_offset_    = ACTIVATION_AMAX_NUM + 9 * 768 + INT8O_GEMM_NUM;
 };

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/Tensor.cc
+++ b/src/turbomind/utils/Tensor.cc
@@ -14,10 +14,10 @@
 * limitations under the License.
 */

-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/string_utils.h"

 #include "stdlib.h"
 #include <cuda_fp16.h>
@@ -31,7 +31,7 @@
 #include <unordered_map>
 #include <vector>

-namespace fastertransformer {
+namespace turbomind {

 Tensor::Tensor():
    // a none tensor.
@@ -271,7 +271,7 @@ std::string Tensor::getNumpyTypeDesc(DataType type) const
                                                                    {TYPE_FP64, "f8"}};

    if (type == TYPE_BF16) {
-        FT_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
+        TM_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
                       "support bfloat16 as of now, it will be properly extended if numpy supports. "
                       "Please refer for the discussions https://github.com/numpy/numpy/issues/19808.");
    }
@@ -352,7 +352,7 @@ TensorMap::TensorMap(const std::unordered_map<std::string, Tensor>& tensor_map)
            insert(kv.first, kv.second);
        }
        else {
-            FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
+            TM_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
        }
    }
 }
@@ -371,7 +371,7 @@ TensorMap::TensorMap(std::initializer_list<std::pair<std::string, Tensor>> tenso
            insert(pair.first, pair.second);
        }
        else {
-            FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
+            TM_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
        }
    }
 }
@@ -456,4 +456,4 @@ void TensorMap::saveNpy(const std::string& base_folder)
    }
 }

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/Tensor.h
+++ b/src/turbomind/utils/Tensor.h
@@ -16,10 +16,10 @@

 #pragma once

-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/string_utils.h"

 #include "stdlib.h"
 #include <cuda_fp16.h>
@@ -33,7 +33,7 @@
 #include <unordered_map>
 #include <vector>

-namespace fastertransformer {
+namespace turbomind {

 typedef enum datatype_enum {
    TYPE_INVALID,
@@ -135,13 +135,13 @@ struct Tensor {
    template<typename T>
    inline T getVal(size_t index) const
    {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
        FT_CHECK(where == MEMORY_CPU);
        FT_CHECK(data != nullptr);
        FT_CHECK_WITH_INFO(index < size(), "index is larger than buffer size");

        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -151,9 +151,9 @@ struct Tensor {
    template<typename T>
    inline T getVal() const
    {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -163,9 +163,9 @@ struct Tensor {
    template<typename T>
    inline T* getPtr() const
    {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getPtr with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getPtr with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -174,7 +174,7 @@ struct Tensor {

    inline void* getPtrWithOffset(size_t offset) const
    {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
        if (data == nullptr) {
            return (void*)data;
        }
@@ -187,9 +187,9 @@ struct Tensor {
    template<typename T>
    inline T* getPtrWithOffset(size_t offset) const
    {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -207,7 +207,7 @@ struct Tensor {
    T max() const
    {
        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -230,7 +230,7 @@ struct Tensor {
    T min() const
    {
        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -253,7 +253,7 @@ struct Tensor {
    T any(T val) const
    {
        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -272,7 +272,7 @@ struct Tensor {
    T all(T val) const
    {
        if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                         getNumpyTypeDesc(getTensorType<T>()).c_str(),
                         getNumpyTypeDesc(type).c_str());
        }
@@ -324,7 +324,7 @@ public:

    inline bool isExist(const std::string& key) const
    {
-        FT_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
        return tensor_map_.find(key) != tensor_map_.end();
    }

@@ -355,7 +355,7 @@ public:

    inline Tensor& at(const std::string& key)
    {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
        FT_CHECK_WITH_INFO(isExist(key),
                           fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
                                  key.c_str(),
@@ -374,7 +374,7 @@ public:

    inline Tensor& at(const std::string& key, Tensor& default_tensor)
    {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
        if (isExist(key)) {
            return tensor_map_.at(key);
        }
@@ -383,7 +383,7 @@ public:

    inline Tensor at(const std::string& key, Tensor& default_tensor) const
    {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
        if (isExist(key)) {
            return tensor_map_.at(key);
        }
@@ -392,7 +392,7 @@ public:

    inline Tensor& at(const std::string& key, Tensor&& default_tensor)
    {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
        if (isExist(key)) {
            return tensor_map_.at(key);
        }
@@ -518,4 +518,4 @@ public:
    void             saveNpy(const std::string& base_folder);
 };

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/activation_types.h
+++ b/src/turbomind/utils/activation_types.h
@@ -16,9 +16,9 @@

 #pragma once

-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 enum class ActivationType {
    Gelu,
@@ -63,4 +63,4 @@ inline bool isGatedActivation(ActivationType activaiton_type)
           || activaiton_type == ActivationType::SiGLU;
 }

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/allocator.h
+++ b/src/turbomind/utils/allocator.h
@@ -41,13 +41,13 @@
 #include <memory>
 #endif

-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"

 #if defined(CUDART_VERSION) && CUDART_VERSION < 11020
 #define CUDA_MEMORY_POOL_DISABLED
 #endif

-namespace fastertransformer {
+namespace turbomind {

 enum class AllocatorType {
    CUDA,
@@ -74,26 +74,26 @@ public:
    template<typename T>
    void* reMalloc(T* ptr, size_t size, const bool is_set_zero = true, bool is_host = false)
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        size              = ((size + 31) / 32) * 32;  // make the buffer align with 32 bytes
        void* void_ptr    = (void*)ptr;
        void* ptr_address = getAddress(void_ptr);
        if (isExist(ptr_address)) {
            ReallocType realloc_type = isReMalloc(ptr_address, size);
            if (realloc_type == ReallocType::INCREASE) {
-                FT_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
+                TM_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
                free((void**)(&void_ptr), is_host);
                return malloc(size, is_set_zero, is_host);
            }
 #if !defined(CUDA_MEMORY_POOL_DISABLED)
            else if (realloc_type == ReallocType::DECREASE) {
-                FT_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
+                TM_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
                free((void**)(&void_ptr), is_host);
                return malloc(size, is_set_zero, is_host);
            }
 #endif
            else {
-                FT_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
+                TM_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
                if (is_set_zero) {
                    memSet(void_ptr, 0, size);
                }
@@ -101,7 +101,7 @@ public:
            }
        }
        else {
-            FT_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
+            TM_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
            return malloc(size, is_set_zero, is_host);
        }
    }
@@ -147,10 +147,10 @@ private:
 public:
    Allocator(int device_id): device_id_(device_id)
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        pointer_mapping_ = new std::unordered_map<void*, size_t>();
 #if defined(CUDA_MEMORY_POOL_DISABLED)
-        FT_LOG_WARNING(
+        TM_LOG_WARNING(
            "Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
            "Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
 #else
@@ -166,7 +166,7 @@ public:
            }
            check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
            if (!peer_access_available) {
-                FT_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
+                TM_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
                               + " is not available.");
                continue;
            }
@@ -183,7 +183,7 @@ public:

    virtual ~Allocator()
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        while (!pointer_mapping_->empty()) {
            free((void**)(&pointer_mapping_->begin()->first));
        }
@@ -202,7 +202,7 @@ public:

    void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        if (size == 0) {
            return nullptr;
        }
@@ -224,7 +224,7 @@ public:
            check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_));
        }
        check_cuda_error(getSetDevice(o_device));
-        FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
+        TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);

        pointer_mapping_->insert({getAddress(ptr), size});

@@ -233,12 +233,12 @@ public:

    void free(void** ptr, bool is_host = false) const
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        void* address = getAddress(*ptr);
        if (*ptr != nullptr) {
            int o_device = 0;
            if (pointer_mapping_->count(address)) {
-                FT_LOG_DEBUG("Free buffer %p", address);
+                TM_LOG_DEBUG("Free buffer %p", address);
                check_cuda_error(getSetDevice(device_id_, &o_device));
                if (is_host) {
                    check_cuda_error(cudaFreeHost(*ptr));
@@ -255,7 +255,7 @@ public:
                pointer_mapping_->erase(address);
            }
            else {
-                FT_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
+                TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
            }
        }
        *ptr = nullptr;
@@ -287,7 +287,7 @@ class Allocator<AllocatorType::TF>: public IAllocator {
        for (int i = 0; i < pointer_mapping_->at(address).dims(); i++) {
            current_buffer_size *= pointer_mapping_->at(address).dim_size(i);
        }
-        FT_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
+        TM_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
        if (current_buffer_size < size) {
            return ReallocType::INCREASE;
        }
@@ -317,7 +317,7 @@ public:

    void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        tensorflow::Tensor buf;
        long long int      buf_size = ((long long int)ceil(size / 32.) * 32);
        tensorflow::Status status;
@@ -347,7 +347,7 @@ public:

    void free(void** ptr, bool is_host = false) const
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        void* address = getAddress(*ptr);
        pointer_mapping_->erase(address);
        *ptr = nullptr;
@@ -387,7 +387,7 @@ class Allocator<AllocatorType::TH>: public IAllocator {
        for (int i = 0; i < pointer_mapping_->at(address).dim(); i++) {
            current_buffer_size *= pointer_mapping_->at(address).size(i);
        }
-        FT_LOG_DEBUG(
+        TM_LOG_DEBUG(
            "current_buffer_size: %d, original buffer: %p, new buffer: %d", current_buffer_size, address, size);
        if (current_buffer_size < size) {
            return ReallocType::INCREASE;
@@ -419,7 +419,7 @@ public:

    void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        int64_t       buf_size = static_cast<int64_t>(ceil(size / 32.)) * 32;
        torch::Tensor buf;
        if (is_host) {
@@ -432,14 +432,14 @@ public:
        if (is_set_zero) {
            cudaMemset(ptr, 0, buf_size);
        }
-        FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
+        TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
        pointer_mapping_->insert({getAddress(ptr), buf});
        return ptr;
    }

    void free(void** ptr, bool is_host = false) const
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        void* address = getAddress(*ptr);
        pointer_mapping_->erase(address);
        *ptr = nullptr;
@@ -448,7 +448,7 @@ public:

    virtual ~Allocator()
    {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
        while (!pointer_mapping_->empty()) {
            void* ptr = pointer_mapping_->begin()->second.data_ptr();
            free((void**)(&ptr));
@@ -463,4 +463,4 @@ public:
    }
 };
 #endif
-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/conv2d.h
+++ b/src/turbomind/utils/conv2d.h
@@ -23,7 +23,7 @@
 #include <cuda_fp16.h>
 #include <cudnn.h>

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 void conv2d(T*             output,
@@ -134,4 +134,4 @@ void conv2d(T*             output,
    checkCUDNN(cudnnDestroyConvolutionDescriptor(convolution_descriptor_));
 }

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasAlgoMap.cc
+++ b/src/turbomind/utils/cublasAlgoMap.cc
@@ -16,7 +16,7 @@

 #include "cublasAlgoMap.h"

-namespace fastertransformer {
+namespace turbomind {

 cublasAlgoMap::cublasAlgoMap(const std::string filename, const std::string sp_config_filename):
    config_filename_(filename), sp_config_filename_(sp_config_filename)
@@ -223,4 +223,4 @@ bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n,
    }
 }

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasAlgoMap.h
+++ b/src/turbomind/utils/cublasAlgoMap.h
@@ -14,7 +14,7 @@
 * limitations under the License.
 */

-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -24,7 +24,7 @@
 #include <utility>

 #pragma once
-namespace fastertransformer {
+namespace turbomind {

 #define GEMM_NUM 6
 #define GEMM_CONFIG "gemm_config.in"
@@ -102,4 +102,4 @@ public:
    getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type);
 };

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasFP8MMWrapper.cu
+++ b/src/turbomind/utils/cublasFP8MMWrapper.cu
@@ -17,7 +17,7 @@
 #include "cublasFP8MMWrapper.h"
 #include "cuda_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 #define CUBLAS_WORKSPACE_1MB 1048576
 cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
@@ -27,7 +27,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
                                       IAllocator*      allocator):
    cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
    cublasVersionCheck();

@@ -44,7 +44,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t   cublas_handle,
                                       IAllocator*      allocator):
    cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
    cublasVersionCheck();
    if (allocator_ != nullptr) {
@@ -54,7 +54,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t   cublas_handle,

 cublasFP8MMWrapper::~cublasFP8MMWrapper()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_ = nullptr;
    if (allocator_ != nullptr) {
        allocator_->free((void**)(&cublas_workspace_qgemm_));
@@ -69,7 +69,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(const cublasFP8MMWrapper& wrapper):
                    wrapper.mu_,
                    wrapper.allocator_)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    cublasVersionCheck();
 }

@@ -135,7 +135,7 @@ void cublasFP8MMWrapper::Gemm(__nv_bfloat16*       res,
                              cudaStream_t         stream,
                              bool                 fastAccum)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_->lock();

    const void*  devAscalePtr = (const void*)kernel_scale;
@@ -345,7 +345,7 @@ void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3*       res,
                              cudaStream_t         stream,
                              bool                 fastAccum)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_->lock();

    const void* devAscalePtr = (const void*)kernel_scale;
@@ -534,7 +534,7 @@ void cublasFP8MMWrapper::Conv1x1Gemm(__nv_fp8_e4m3*       res,
                                     const float          output_scale,
                                     cudaStream_t         stream)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_->lock();
    size_t workspace_size = 0;
    // get workspace size
@@ -615,7 +615,7 @@ void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_bfloat16*       res,
                                       const float*         output_scale,
                                       cudaStream_t         stream)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_->lock();

    const void*  devAscalePtr = (const void*)kernel_scale;
@@ -777,7 +777,7 @@ void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_fp8_e4m3*       res,
                                       const float*         output_scale,
                                       cudaStream_t         stream)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_->lock();

    const void*  devAscalePtr = (const void*)kernel_scale;
@@ -1018,4 +1018,4 @@ template void cublasFP8MMWrapper::Gemm_Bias_Act<false, false>(__nv_fp8_e4m3*
                                                              const float*         output_scale,
                                                              cudaStream_t         stream);

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasFP8MMWrapper.h
+++ b/src/turbomind/utils/cublasFP8MMWrapper.h
@@ -16,9 +16,9 @@

 #include "3rdparty/fp8_qgmma_1x1/fp8_qgmma_1x1_utils.h"
 #include "cuda_utils.h"
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -28,7 +28,7 @@

 #pragma once

-namespace fastertransformer {
+namespace turbomind {

 class cublasFP8MMWrapper: public cublasMMWrapper {
 public:
@@ -170,8 +170,8 @@ public:

 private:
    int                                 version_major_, version_minor_, version_patch_;
-    fastertransformer::qgmma1x1Launcher qgmmaLauncher;
+    turbomind::qgmma1x1Launcher qgmmaLauncher;
    void*                               cublas_workspace_qgemm_ = nullptr;
 };

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasINT8MMWrapper.cc
+++ b/src/turbomind/utils/cublasINT8MMWrapper.cc
@@ -20,7 +20,7 @@
 #error CUDART_VERSION Undefined!
 #endif

-namespace fastertransformer {
+namespace turbomind {
 cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
                                         cudaStream_t     stream,
                                         cublasAlgoMap*   cublas_algo_map,
@@ -556,4 +556,4 @@ void cublasINT8MMWrapper::SpGemm(
    mu_->unlock();
 }
 #endif
-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasINT8MMWrapper.h
+++ b/src/turbomind/utils/cublasINT8MMWrapper.h
@@ -15,9 +15,9 @@
 */

 #include "cuda_utils.h"
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -26,7 +26,7 @@
 #include <string>

 #pragma once
-namespace fastertransformer {
+namespace turbomind {

 class cublasINT8MMWrapper: public cublasMMWrapper {
 private:
@@ -91,4 +91,4 @@ public:
 #endif
 };

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasMMWrapper.cc
+++ b/src/turbomind/utils/cublasMMWrapper.cc
@@ -21,7 +21,7 @@
 #error CUDART_VERSION Undefined!
 #endif

-namespace fastertransformer {
+namespace turbomind {
 cublasMMWrapper::cublasMMWrapper(cublasHandle_t   cublas_handle,
                                 cublasLtHandle_t cublaslt_handle,
                                 cudaStream_t     stream,
@@ -35,7 +35,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t   cublas_handle,
    mu_(mu),
    allocator_(allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (allocator_ != nullptr) {
        cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
    }
@@ -57,7 +57,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t     cublas_handle,
    mu_(mu),
    allocator_(allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (allocator_ != nullptr) {
        cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
    }
@@ -66,7 +66,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t     cublas_handle,

 cublasMMWrapper::~cublasMMWrapper()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_ = nullptr;
    if (allocator_ != nullptr) {
        allocator_->free((void**)(&cublas_workspace_));
@@ -85,7 +85,7 @@ cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
    mu_(wrapper.mu_),
    allocator_(wrapper.allocator_)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (allocator_ != nullptr) {
        cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
    }
@@ -110,7 +110,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                           cudaDataType_t    computeType,
                           cublasGemmAlgo_t  algo)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    mu_->lock();
    check_cuda_error(cublasGemmEx(cublas_handle_,
                                  transa,
@@ -147,7 +147,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                           void*             C,
                           const int         ldc)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f);
 }

@@ -165,7 +165,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                           float             f_alpha,
                           float             f_beta)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    half h_alpha = (half)(f_alpha);
    half h_beta  = (half)(f_beta);

@@ -396,7 +396,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                           void*             C,
                           const int         ldc)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    cudaDataType_t      Atype, Btype, Ctype;
    cublasComputeType_t computeType;
    cudaDataType_t      scaleType;
@@ -1099,4 +1099,4 @@ void cublasMMWrapper::Int8Gemm(const int     m,
    return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, (float*)nullptr, 1, false);
 }

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cublasMMWrapper.h
+++ b/src/turbomind/utils/cublasMMWrapper.h
@@ -15,8 +15,8 @@
 */

 #include "cuda_utils.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -25,7 +25,7 @@
 #include <string>

 #pragma once
-namespace fastertransformer {
+namespace turbomind {

 class cublasMMWrapper {
 protected:
@@ -293,4 +293,4 @@ public:
 #endif
 };

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cuda_bf16_fallbacks.cuh
+++ b/src/turbomind/utils/cuda_bf16_fallbacks.cuh
@@ -16,10 +16,10 @@

 #pragma once

-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include <cuda_fp16.h>

-namespace fastertransformer {
+namespace turbomind {

 #ifdef ENABLE_BF16
 inline __device__ float2 bf1622float2(const __nv_bfloat162 val)
@@ -287,4 +287,4 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _

 #endif  // ENABLE_BF16

-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cuda_fp8_utils.cu
+++ b/src/turbomind/utils/cuda_fp8_utils.cu
@@ -16,7 +16,7 @@

 #include "cuda_fp8_utils.h"

-namespace fastertransformer {
+namespace turbomind {
 #ifdef ENABLE_FP8

 template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
@@ -121,4 +121,4 @@ template void
 invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream);

 #endif  // ENABLE_FP8
-}  // namespace fastertransformer
+}  // namespace turbomind
--- a/src/turbomind/utils/cuda_fp8_utils.h
+++ b/src/turbomind/utils/cuda_fp8_utils.h
@@ -31,7 +31,7 @@
 #define USE_QGMMA
 #endif

-namespace fastertransformer {
+namespace turbomind {

 const float FP8_E4M3_MAX = 480.0f;

@@ -190,5 +190,5 @@ void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_
 template<typename T_W>
 void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream);

-}  // namespace fastertransformer
+}  // namespace turbomind
 #endif  // ENABLE_FP8
--- a/src/turbomind/utils/cuda_type_utils.cuh
+++ b/src/turbomind/utils/cuda_type_utils.cuh
@@ -16,13 +16,13 @@

 #pragma once

-#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_bf16_fallbacks.cuh"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
 #include <cuda.h>
 #include <cuda_fp16.h>

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 inline __device__ T ldg(const T* val)
@@ -598,4 +598,4 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)

 #endif  // ENABLE_FP8

-}  // namespace fastertransformer
+}  // namespace turbomind