build turbomind (#35)

* build turbomind
* change namespace fastertransformer to turbomind
* change logger name

build turbomind (#35)
* build turbomind
* change namespace fastertransformer to turbomind
* change logger name
35d64462 · lvhan028 · GitHub · 53d2e42c · 35d64462 · 35d64462
Unverified commit 35d64462, authored Jul 01, 2023 by lvhan028; committed by GitHub on Jul 01, 2023.
16 changed files
--- a/tests/gemm_dequantize/th_gemm_dequantize.cc
+++ b/tests/gemm_dequantize/th_gemm_dequantize.cc
@@ -22,9 +22,9 @@
 #include <torch/custom_class.h>
 #include <torch/script.h>

-#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
-#include "src/fastertransformer/th_op/th_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
+#include "src/turbomind/th_op/th_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"

 #include "cutlass/numeric_types.h"

@@ -32,7 +32,7 @@ using torch::Tensor;

 namespace torch_ext {

-namespace ft = fastertransformer;
+namespace ft = turbomind;

 template<typename T, typename WeightType>
 Tensor fused_gemm_dq_helper(
@@ -48,7 +48,7 @@ Tensor fused_gemm_dq_helper(
    const WeightType* weight_ptr    = get_ptr<const WeightType>(weight);
    const T*          scales_ptr    = get_ptr<const T>(scales);

-    fastertransformer::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
+    turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
    const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);

    auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
@@ -152,7 +152,7 @@ Tensor fused_gemm_dq(Tensor input_activations, Tensor weight, Tensor scales)
 Tensor
 bench_cublas(Tensor input_activations, Tensor weight_dequantized, const int64_t timing_iterations, float& avg_time)
 {
-    using namespace fastertransformer;
+    using namespace turbomind;
    const int m = input_activations.size(0);
    const int n = weight_dequantized.size(1);
    const int k = input_activations.size(1);
@@ -257,7 +257,7 @@ Tensor fused_gemm_dq_bias_act_helper(
    const T*          scales_ptr    = get_ptr<const T>(scales);
    const T*          bias_ptr      = get_ptr<const T>(bias);

-    fastertransformer::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
+    turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
    const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);

    auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));

--- a/tests/int8_gemm/int8_gemm_test.cu
+++ b/tests/int8_gemm/int8_gemm_test.cu
@@ -24,17 +24,17 @@
 #include <torch/custom_class.h>
 #include <torch/script.h>

-#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
-#include "src/fastertransformer/th_op/th_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
+#include "src/turbomind/th_op/th_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/logger.h"

 #include "cutlass/numeric_types.h"

 using torch::Tensor;
 using torch_ext::get_ptr;

-namespace ft = fastertransformer;
+namespace ft = turbomind;

 template<typename T>
 void int8_gemm_test(
@@ -143,9 +143,9 @@ void int8_gemm_test(
    auto duration = duration_cast<microseconds>(end - start);

    if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
-        FT_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
+        TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
    } else {
-        FT_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
+        TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
        // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
    }
 }
@@ -153,7 +153,7 @@ void int8_gemm_test(
 int main(int argc, char **argv)
 {
    if (argc != 7) {
-        FT_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
+        TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
        return 0;
    }


--- a/tests/unittests/gtest_utils.h
+++ b/tests/unittests/gtest_utils.h
@@ -8,12 +8,12 @@
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>

-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/logger.h"

-namespace ft = fastertransformer;
+namespace ft = turbomind;

 namespace {

@@ -48,11 +48,11 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
        bool ok = almostEqual(a, b, atol, rtol);
        // Print the error.
        if (!ok && failures < 4) {
-            FT_LOG_ERROR(">> invalid result for i=%lu:", i);
-            FT_LOG_ERROR(">>    found......: %10.6f", a);
-            FT_LOG_ERROR(">>    expected...: %10.6f", b);
-            FT_LOG_ERROR(">>    error......: %.6f", fabsf(a - b));
-            FT_LOG_ERROR(">>    tol........: %.6f", atol + rtol * fabs(b));
+            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
+            TM_LOG_ERROR(">>    found......: %10.6f", a);
+            TM_LOG_ERROR(">>    expected...: %10.6f", b);
+            TM_LOG_ERROR(">>    error......: %.6f", fabsf(a - b));
+            TM_LOG_ERROR(">>    tol........: %.6f", atol + rtol * fabs(b));
        }
        // Update the number of failures.
        failures += ok ? 0 : 1;
@@ -65,7 +65,7 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
    // Allow not matched up to 1% elements.
    size_t tol_failures = (size_t)(0.01 * size);
    if (failures > tol_failures) {
-        FT_LOG_ERROR("%s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
+        TM_LOG_ERROR("%s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
                     name.c_str(), 100. * failures / size, atol, rtol, 100. * relative_gap);
    }
    return failures <= tol_failures;

--- a/tests/unittests/test_activation.cu
+++ b/tests/unittests/test_activation.cu
@@ -2,14 +2,14 @@
 #include <string>     // std::string
 #include <vector>     // std::vector

-#include "src/fastertransformer/kernels/activation_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/activation_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/logger.h"

 #include "unittest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 #define PRINT_LIMIT 16
 #define EPSILON (1e-20)
@@ -30,7 +30,7 @@ struct TestCase {

    void print()
    {
-        FT_LOG_INFO(toString());
+        TM_LOG_INFO(toString());
    }
 };

@@ -107,7 +107,7 @@ void testActivationKernel(TestCase tc)
        invokeAddBiasGeluV2(output_baseline, bias, (const int*) nullptr, (const T*) nullptr, m, n, stream);
    }
    float total_time_opt = cuda_timer_opt.stop();
-    FT_LOG_INFO("%s baseline_time: %f us, opt_time: %f us, speedup: %f (ite: %d)",
+    TM_LOG_INFO("%s baseline_time: %f us, opt_time: %f us, speedup: %f (ite: %d)",
                tc.toString().c_str(),
                total_time_baseline / ite * 1000.f,
                total_time_opt / ite * 1000.f,
@@ -148,7 +148,7 @@ int main()
        // testActivationKernel<float>(tc);
        testActivationKernel<half>(tc);
    }
-    FT_LOG_INFO("testActivationKernel done");
+    TM_LOG_INFO("testActivationKernel done");

    return 0;
 }
--- a/tests/unittests/test_attention_kernels.cu
+++ b/tests/unittests/test_attention_kernels.cu
@@ -15,19 +15,19 @@
 */

 #include "tests/unittests/gtest_utils.h"
-#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
-#include "src/fastertransformer/kernels/gpt_kernels.h"
-#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/kernels/gen_relative_pos_bias.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/kernels/unfused_attention_kernels.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
+#include "src/turbomind/utils/Tensor.h"

 #include <curand.h>
 #include <sstream>
 #include <stdexcept>
 #include <vector>

-using namespace fastertransformer;
+using namespace turbomind;

 namespace {


--- a/tests/unittests/test_context_attention_layer.cu
+++ b/tests/unittests/test_context_attention_layer.cu
@@ -10,17 +10,17 @@
 #include <thrust/host_vector.h>
 #include <thrust/transform.h>

-#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
-#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/bert_preprocess_kernels.h"
+#include "src/turbomind/kernels/unfused_attention_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
 #include "unittest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 template<typename scalar_t>
 __global__ void pad_query_kernel(
@@ -216,7 +216,7 @@ static const char* usage = "Usage: %s <batch-size> <num-heads> <key-len> <query-

 int main(int argc, const char* argv[])
 {
-    using namespace fastertransformer;
+    using namespace turbomind;
    using scalar_t                            = half;
    static const cudaDataType_t kCudaDataType = std::is_same<scalar_t, half>::value ? CUDA_R_16F : CUDA_R_32F;


--- a/tests/unittests/test_gemm.cu
+++ b/tests/unittests/test_gemm.cu
@@ -6,15 +6,15 @@
 #include <tuple>
 #include <vector>

-#include "src/fastertransformer/layers/DenseWeight.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/DenseWeight.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 // Can be replaced by the function provided by a test framework

@@ -33,7 +33,7 @@ public:

 #define EXPECT_TRUE(cond)                           \
    do { if(!(cond)) {                              \
-        FT_LOG_ERROR("TEST FAIL [%s] at %s:%d",     \
+        TM_LOG_ERROR("TEST FAIL [%s] at %s:%d",     \
                     __func__, __FILE__, __LINE__); \
        throw TestFailureError(__func__);           \
    } } while(false)
@@ -42,7 +42,7 @@ public:
    do {                                                        \
        bool is_ok = checkResult<dtype,ctype>(name, out, ref);  \
        if(!is_ok) {                                            \
-            FT_LOG_ERROR("TEST FAIL [%s] at %s:%d",             \
+            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d",             \
                        __func__, __FILE__, __LINE__);          \
            throw TestFailureError(__func__);                   \
        }                                                       \
@@ -81,7 +81,7 @@ public:
    TensorWrapper(TensorWrapper const& other)
        : allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
    {
-        FT_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
+        TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
    }
    ~TensorWrapper()
    {
@@ -220,11 +220,11 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
        bool ok = almostEqual(a, b, atol, rtol);
        // Print the error.
        if( !ok && failures < 4 ) {
-            FT_LOG_ERROR(">> invalid result for i=%lu:", i);
-            FT_LOG_ERROR(">>    found......: %10.6f", a);
-            FT_LOG_ERROR(">>    expected...: %10.6f", b);
-            FT_LOG_ERROR(">>    error......: %.6f", fabsf(a - b));
-            FT_LOG_ERROR(">>    tol........: %.6f", atol + rtol * fabs(b));
+            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
+            TM_LOG_ERROR(">>    found......: %10.6f", a);
+            TM_LOG_ERROR(">>    expected...: %10.6f", b);
+            TM_LOG_ERROR(">>    error......: %.6f", fabsf(a - b));
+            TM_LOG_ERROR(">>    tol........: %.6f", atol + rtol * fabs(b));
        }

        // Update the number of failures.
@@ -233,7 +233,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa

    // Allow not matched up to 1% elements.
    size_t tol_failures = (size_t)(0.01 * out_size);
-    FT_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
+    TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
                name.c_str(), failures <= tol_failures ? "OK" : "FAILED",
                100. * failures / out_size, atol, rtol);
    return failures <= tol_failures;
@@ -306,7 +306,7 @@ static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs

 template<typename T, DataType computeType>
 void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
-    FT_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
                m, n, k, toString<T, computeType>().c_str());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
@@ -324,7 +324,7 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {

    for (auto &op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);
        computeReference<computeType>(op_pair.transa, op_pair.transb,
                                      expected, a_tensor, b_tensor);

@@ -362,7 +362,7 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
    // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                m, n, k, toString<T, computeType>().c_str());

    Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -444,7 +444,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
    // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                m, n, k, toString<T, computeType>().c_str());

    Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -514,7 +514,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {

    for (auto &op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);

        size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
        size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
@@ -578,7 +578,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k) {
    // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
                batch_size, m, n, k, toString<T, computeType>().c_str());

    Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -693,7 +693,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
 // but let us keep these template variables for later use.
 template<typename T, DataType computeType>
 void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
-    FT_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
                m, n, k, toString<T, computeType>().c_str());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
@@ -712,7 +712,7 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
    for (auto &op_pair : op_pairs) {
        // A/B will be switched in SpGemm.
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);

        b_tensor.setRandomValues();
        pruneMatrixB(b_tensor.data, stream,
@@ -763,7 +763,7 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
    // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                m, n, k, toString<T, computeType>().c_str());

    Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -799,7 +799,7 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {

    for (auto &op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);

        b_tensor.setRandomValues();
        pruneMatrixB(b_tensor.data, stream,
@@ -904,6 +904,6 @@ int main(int argc, char* argv[]) {
        testSpGemmConsistencyMatmul<half, TYPE_FP16>(m, n, k);
    }
 #endif
-    FT_LOG_INFO("Test done");
+    TM_LOG_INFO("Test done");
    return 0;
 }
--- a/tests/unittests/test_gpt_kernels.cu
+++ b/tests/unittests/test_gpt_kernels.cu
 #include <vector>
 #include <random>

-#include "src/fastertransformer/kernels/gpt_kernels.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/utils/memory_utils.h"

 #include "unittest_utils.h"


--- a/tests/unittests/test_int8.cu
+++ b/tests/unittests/test_int8.cu
@@ -5,10 +5,10 @@
 #include <string>
 #include <vector>

-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/kernels/transpose_int8_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/kernels/transpose_int8_kernels.h"

 #include <algorithm>
 #include <iostream>
@@ -16,7 +16,7 @@

 #include "tests/unittests/gtest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 class Int8TestSuite: public FtTestBase {


--- a/tests/unittests/test_logprob_kernels.cu
+++ b/tests/unittests/test_logprob_kernels.cu
@@ -6,15 +6,15 @@
 #include <vector>
 #include <sys/time.h>

-#include "src/fastertransformer/kernels/logprob_kernels.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/logprob_kernels.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"

 #include "tests/unittests/gtest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 ////////////////////////////////////////////////////////////////////////////////////

@@ -182,7 +182,7 @@ public:
        bool passed = checkResult(param.toString(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
        EXPECT_TRUE(passed);

-        FT_LOG_DEBUG("free host buffers");
+        TM_LOG_DEBUG("free host buffers");
        delete[] expected_cum_log_probs;
        delete[] h_input_lengths;
        delete[] h_input_ids;

--- a/tests/unittests/test_penalty_kernels.cu
+++ b/tests/unittests/test_penalty_kernels.cu
@@ -27,16 +27,16 @@
 #include <cublasLt.h>
 #include <cuda_runtime.h>

-#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/kernels/sampling_penalty_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"

 // #include "tests/unittests/unittest_utils.h"
 #include "tests/unittests/gtest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 struct TemperatureTestParam {
    size_t batch_size;

--- a/tests/unittests/test_sampling.cu
+++ b/tests/unittests/test_sampling.cu
@@ -9,17 +9,17 @@
 #include <cublas_v2.h>
 #include <cuda_runtime.h>

-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/layers/DynamicDecodeLayer.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"

 #include "tests/unittests/unittest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 struct TestCase {
    std::string name;
@@ -48,7 +48,7 @@ struct TestCase {

    void print()
    {
-        FT_LOG_INFO(toString());
+        TM_LOG_INFO(toString());
    }
 };

@@ -157,11 +157,11 @@ void testCumLogProbComputation(TestCase tc)
    memset(expected_cum_log_probs, 0, sizeof(float) * batch_size * beam_width);

 #ifndef NDEBUG
-    FT_LOG_DEBUG("logit values");
+    TM_LOG_DEBUG("logit values");
    printMatrixWithLimit(h_logits, batch_size * beam_width, vocab_size, vocab_size, false);
-    FT_LOG_DEBUG("\nprob values");
+    TM_LOG_DEBUG("\nprob values");
    printMatrixWithLimit(h_probs, batch_size * beam_width, vocab_size, vocab_size, false);
-    FT_LOG_DEBUG("\nlog-prob values");
+    TM_LOG_DEBUG("\nlog-prob values");
    printMatrixWithLimit(h_log_probs, batch_size * beam_width, vocab_size, vocab_size, false);
 #endif

@@ -224,7 +224,7 @@ void testCumLogProbComputation(TestCase tc)

        dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);

-        FT_LOG_DEBUG("Step %2d generated ids", step);
+        TM_LOG_DEBUG("Step %2d generated ids", step);
        cudaD2Hcpy(
            h_output_ids,
            (int*)dynamic_decode_output_tensors.at("output_ids").getPtrWithOffset(step * (batch_size * beam_width)),
@@ -234,7 +234,7 @@ void testCumLogProbComputation(TestCase tc)
        for (size_t i = 0; i < batch_size * beam_width; ++i) {
            int idx = i * vocab_size + h_output_ids[i];
            expected_cum_log_probs[i] += (float)h_log_probs[idx];
-            FT_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
+            TM_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
                         "| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
                         (int)step,
                         (int)i,
@@ -246,7 +246,7 @@ void testCumLogProbComputation(TestCase tc)
                         expected_cum_log_probs[i],
                         (float)h_probs[idx]);
        }
-        FT_LOG_DEBUG("");
+        TM_LOG_DEBUG("");

 #ifndef NDEBUG
        // print output ids
@@ -285,10 +285,10 @@ void testCumLogProbComputation(TestCase tc)

 void printTensors(TensorMap* map, size_t limit = 8)
 {
-    FT_LOG_INFO("Tensors:");
+    TM_LOG_INFO("Tensors:");
    for (auto& kv : *map) {
        Tensor t = kv.second;
-        FT_LOG_INFO(" - %-18s : %s", kv.first.c_str(), t.toString().c_str());
+        TM_LOG_INFO(" - %-18s : %s", kv.first.c_str(), t.toString().c_str());
    }
 }

@@ -468,13 +468,13 @@ private:
                    for (auto& expt : expts) {
                        ss << " " << expt;
                    }
-                    FT_LOG_DEBUG("%s", ss.str().c_str());
+                    TM_LOG_DEBUG("%s", ss.str().c_str());
                }
                ++failures;
            }
        }
        delete[] h_output_ids;
-        FT_LOG_DEBUG("check...%6s : %s (failures: %d / %d)",
+        TM_LOG_DEBUG("check...%6s : %s (failures: %d / %d)",
                     failures == 0 ? "....OK" : "FAILED",
                     name.c_str(),
                     failures,
@@ -491,7 +491,7 @@ private:
                      float*                     temperature,
                      float*                     repetition_penalty)
    {
-        FT_LOG_INFO("Test %s", name.c_str());
+        TM_LOG_INFO("Test %s", name.c_str());
        std::string tag    = fmtstr("Test %s T=%s", name.c_str(), std::is_same<T, float>::value ? "fp32" : "fp16");
        bool        passed = true;
        for (unsigned long long seed = 0; seed < max_seed; ++seed) {
@@ -518,7 +518,7 @@ private:
            passed &= is_ok;
 #ifndef NDEBUG
            if (!is_ok) {
-                FT_LOG_ERROR("actual output ids");
+                TM_LOG_ERROR("actual output ids");
                printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
            }
 #endif
@@ -526,7 +526,7 @@ private:
            delete input_tensors;
            this->teardown();
        }
-        FT_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
+        TM_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
        return passed;
    }

@@ -539,7 +539,7 @@ private:
                                    float*                     temperature,
                                    float*                     repetition_penalty)
    {
-        FT_LOG_INFO("Test %s", name.c_str());
+        TM_LOG_INFO("Test %s", name.c_str());
        std::string tag    = fmtstr("Test %s T=%s", name.c_str(), std::is_same<T, float>::value ? "fp32" : "fp16");
        bool        passed = true;
        size_t      local_batch_size = 2;
@@ -567,7 +567,7 @@ private:
            passed &= is_ok;
 #ifndef NDEBUG
            if (!is_ok) {
-                FT_LOG_ERROR("actual output ids");
+                TM_LOG_ERROR("actual output ids");
                printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
            }
 #endif
@@ -575,7 +575,7 @@ private:
            delete input_tensors;
            this->teardown();
        }
-        FT_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
+        TM_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
        return passed;
    }

@@ -1229,7 +1229,7 @@ static inline bool isEqualInPeriod(T* vals, size_t size, size_t period_size)
    for (size_t i = 0; i + period_size - 1 < size; i += period_size) {
        for (size_t j = 1; j < period_size; ++j) {
            if (vals[i] != vals[i + j]) {
-                FT_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
+                TM_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
                return false;
            }
        }
@@ -1244,7 +1244,7 @@ static inline bool isEqualInPeriod(T* vals, size_t size, size_t period_size, siz
    for (size_t i = 0; i + period_size - 1 < size; i += period_size) {
        for (size_t j = 1; j < period_size; ++j) {
            if (j != except && vals[i] != vals[i + j]) {
-                FT_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
+                TM_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
                return false;
            }
        }
@@ -1284,7 +1284,7 @@ void testCuandBatchInitialize(const size_t batch_size)

    // The same seed produces the same random number.
    bool passed = isEqualInPeriod(h_rand_vals, batch_size, period_size);
-    FT_LOG_INFO("CuandBatchInitTest check....... : %s", passed ? "OK" : "FAILED");
+    TM_LOG_INFO("CuandBatchInitTest check....... : %s", passed ? "OK" : "FAILED");
    EXPECT_TRUE(passed);

    delete h_rand_vals;
@@ -1299,7 +1299,7 @@ void testCuandBatchInitialize(const size_t batch_size)
 template<typename T, bool SINGLE_RANDOM_SEED, bool HAS_DIFF_ARGS, bool USE_LOCAL_BATCH>
 void testSamplingLayerCurandInit(TestCase tc)
 {
-    FT_LOG_DEBUG("testSamplingLayerCurandInit %s", tc.toString().c_str());
+    TM_LOG_DEBUG("testSamplingLayerCurandInit %s", tc.toString().c_str());
    const DataType data_type = getTensorType<T>();

    const size_t beam_width = 1;
@@ -1376,7 +1376,7 @@ void testSamplingLayerCurandInit(TestCase tc)
    deviceFill(d_end_id_buf, batch_size, end_id);

 #ifndef NDEBUG
-    FT_LOG_DEBUG("Random Seeds");
+    TM_LOG_DEBUG("Random Seeds");
    printMatrixWithLimit(random_seed, 1, random_seed_size, random_seed_size, false);
 #endif

@@ -1400,7 +1400,7 @@ void testSamplingLayerCurandInit(TestCase tc)
        cudaH2Dcpy(d_logits_buf, h_logits, batchxbeam * vocab_size);

 #ifndef NDEBUG
-        FT_LOG_DEBUG("logit values");
+        TM_LOG_DEBUG("logit values");
        printMatrixWithLimit(h_logits, batchxbeam, vocab_size, vocab_size, false);
 #endif
        for (uint ite = 0; ite < iteration_num; ++ite) {
@@ -1434,9 +1434,9 @@ void testSamplingLayerCurandInit(TestCase tc)
            dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
            sync_check_cuda_error();
 #ifndef NDEBUG
-            FT_LOG_DEBUG("Step %2d generated ids", step);
+            TM_LOG_DEBUG("Step %2d generated ids", step);
            printMatrix(d_output_ids, max_seq_len, batchxbeam, batchxbeam, true);
-            FT_LOG_DEBUG("");
+            TM_LOG_DEBUG("");
 #endif
            // check results.
            cudaD2Hcpy(h_output_ids,
@@ -1452,7 +1452,7 @@ void testSamplingLayerCurandInit(TestCase tc)
                             HAS_DIFF_ARGS ? "true" : "false",
                             USE_LOCAL_BATCH ? "true" : "false",
                             (std::is_same<T, float>::value ? " fp32" : " fp16"));
-    FT_LOG_INFO("check...%s SamplingLayerCurandInitTest %-30s", passed ? "....OK" : "FAILED", tag.c_str());
+    TM_LOG_INFO("check...%s SamplingLayerCurandInitTest %-30s", passed ? "....OK" : "FAILED", tag.c_str());
    EXPECT_TRUE(passed);

    free(h_logits);
@@ -1495,13 +1495,13 @@ int main()
        testCumLogProbComputation<float>(tc);
        testCumLogProbComputation<half>(tc);
    }
-    FT_LOG_INFO("testCumLogProbComputation done");
+    TM_LOG_INFO("testCumLogProbComputation done");

    SamplingDecodeTest<float> sampling_decode_test;
    sampling_decode_test.testAll();

    testCuandBatchInitialize(127);
-    FT_LOG_INFO("testCuandBatchInitialize done");
+    TM_LOG_INFO("testCuandBatchInitialize done");

 #define LAUNCH_VARIANTS(T, tc, local_batch)                                                                            \
    testSamplingLayerCurandInit<T, true, false, local_batch>(tc);                                                      \
@@ -1515,7 +1515,7 @@ int main()
        LAUNCH_VARIANTS(half, tc, true);
    }
 #undef LAUNCH_VARIANTS
-    FT_LOG_INFO("testSamplingLayerCurandInit done");
+    TM_LOG_INFO("testSamplingLayerCurandInit done");

    return 0;
 }
--- a/tests/unittests/test_sampling_kernels.cu
+++ b/tests/unittests/test_sampling_kernels.cu
@@ -10,18 +10,18 @@
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>

-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/layers/DynamicDecodeLayer.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"

 #include "tests/unittests/gtest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 namespace {


--- a/tests/unittests/test_sampling_layer.cu
+++ b/tests/unittests/test_sampling_layer.cu
@@ -9,18 +9,18 @@
 #include <cublasLt.h>
 #include <cuda_runtime.h>

-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/layers/DynamicDecodeLayer.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"

 // #include "tests/unittests/unittest_utils.h"
 #include "tests/unittests/gtest_utils.h"

-using namespace fastertransformer;
+using namespace turbomind;

 struct SamplingLayerTestParam {
    size_t batch_size;
@@ -256,12 +256,12 @@ protected:
                    for (auto& expt : expts) {
                        ss << " " << expt;
                    }
-                    FT_LOG_DEBUG("%s", ss.str().c_str());
+                    TM_LOG_DEBUG("%s", ss.str().c_str());
                }
                ++failures;
            }
        }
-        FT_LOG_DEBUG("check...%6s : failures: %d / %d",
+        TM_LOG_DEBUG("check...%6s : failures: %d / %d",
                     failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
        delete[] h_output_ids;
        return failures == 0;
@@ -302,7 +302,7 @@ public:
            EXPECT_TRUE(passed) << "Failed at seed " << seed;
 #ifndef NDEBUG
            if (!passed) {
-                FT_LOG_ERROR("actual output ids");
+                TM_LOG_ERROR("actual output ids");
                printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
            }
 #endif
@@ -867,7 +867,7 @@ protected:
            dynamic_decode_layer->forward(&dynamic_decode_output_tensors,
                                        &dynamic_decode_input_tensors);

-            FT_LOG_DEBUG("Step %2d generated ids", step);
+            TM_LOG_DEBUG("Step %2d generated ids", step);
            cudaD2Hcpy(h_output_ids,
                       dynamic_decode_output_tensors
                           .at("output_ids")
@@ -878,14 +878,14 @@ protected:
            for (size_t i = 0; i < batch_size * beam_width; ++i) {
                int idx = i * vocab_size + h_output_ids[i];
                expected_cum_log_probs[i] += (float)h_log_probs[idx];
-                FT_LOG_DEBUG(
+                TM_LOG_DEBUG(
                    "| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
                    "| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
                    (int)step, (int)i, (int)idx, (int)h_output_ids[i],
                    h_output_log_probs[step * batch_size * beam_width + i], (float)h_log_probs[idx],
                    h_cum_log_probs[i], expected_cum_log_probs[i], (float)h_probs[idx]);
            }
-            FT_LOG_DEBUG("");
+            TM_LOG_DEBUG("");
        }

        bool passed = checkResult(param.toString(), cum_log_probs, expected_cum_log_probs, batch_size * beam_width);

--- a/tests/unittests/test_tensor.cu
+++ b/tests/unittests/test_tensor.cu
@@ -4,9 +4,9 @@

 #include <gtest/gtest.h>

-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"

-using namespace fastertransformer;
+using namespace turbomind;

 namespace {


--- a/tests/unittests/unittest_utils.h
+++ b/tests/unittests/unittest_utils.h
@@ -26,15 +26,15 @@
 #include <string>      // string
 #include <vector>      // vector

-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/string_utils.h"

 #define PRINT_LIMIT 16
 #define EPSILON (1e-20)
 #define EPSILON_FP16 (1e-10)

-using namespace fastertransformer;
+using namespace turbomind;

 class TestFailureError : public std::exception {
 private:
@@ -51,14 +51,14 @@ public:

 #define EXPECT_TRUE(cond)                                  \
    do { if(!(cond)) {                                     \
-        FT_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d",        \
+        TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d",        \
                     __func__, #cond, __FILE__, __LINE__); \
        throw TestFailureError(__func__);                  \
    } } while(false)

 #define EXPECT_FALSE(cond)                                 \
    do { if(cond) {                                        \
-        FT_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d",        \
+        TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d",        \
                     __func__, #cond, __FILE__, __LINE__); \
        throw TestFailureError(__func__);                  \
    } } while(false)
@@ -92,11 +92,11 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
        bool ok = almostEqual(a, b, atol, rtol);
        // Print the error.
        if (!ok && failures < 4) {
-            FT_LOG_ERROR(">> invalid result for i=%lu:", i);
-            FT_LOG_ERROR(">>    found......: %10.6f", a);
-            FT_LOG_ERROR(">>    expected...: %10.6f", b);
-            FT_LOG_ERROR(">>    error......: %.6f", fabsf(a - b));
-            FT_LOG_ERROR(">>    tol........: %.6f", atol + rtol * fabs(b));
+            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
+            TM_LOG_ERROR(">>    found......: %10.6f", a);
+            TM_LOG_ERROR(">>    expected...: %10.6f", b);
+            TM_LOG_ERROR(">>    error......: %.6f", fabsf(a - b));
+            TM_LOG_ERROR(">>    tol........: %.6f", atol + rtol * fabs(b));
        }
        // Update the number of failures.
        failures += ok ? 0 : 1;
@@ -108,7 +108,7 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float

    // Allow not matched up to 1% elements.
    size_t tol_failures = (size_t)(0.01 * size);
-    FT_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
+    TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
                failures <= tol_failures ? "....OK" : "FAILED", name.c_str(),
                100. * failures / size, atol, rtol, 100. * relative_gap);
    return failures <= tol_failures;