Unverified commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
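
The change is mechanical and repo-wide: every include path moves from src/fastertransformer/ to src/turbomind/, the C++ namespace fastertransformer becomes turbomind, and the FT_LOG_* logging macros become TM_LOG_*. A minimal before/after sketch distilled from the hunks below; the short alias ft is kept, so call sites written against ft:: do not change:

    // Before this commit (FasterTransformer layout):
    //     #include "src/fastertransformer/utils/logger.h"
    //     namespace ft = fastertransformer;
    //     FT_LOG_INFO("Test done");

    // After this commit (TurboMind layout):
    #include "src/turbomind/utils/logger.h"

    namespace ft = turbomind;  // alias unchanged, so existing ft:: call sites keep compiling

    int main()
    {
        TM_LOG_INFO("Test done");  // logger prefix renamed FT_LOG_* -> TM_LOG_*
        return 0;
    }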
@@ -22,9 +22,9 @@
 #include <torch/custom_class.h>
 #include <torch/script.h>
-#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
-#include "src/fastertransformer/th_op/th_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
+#include "src/turbomind/th_op/th_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include "cutlass/numeric_types.h"
@@ -32,7 +32,7 @@ using torch::Tensor;
 namespace torch_ext {
 
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 
 template<typename T, typename WeightType>
 Tensor fused_gemm_dq_helper(
@@ -48,7 +48,7 @@ Tensor fused_gemm_dq_helper(
     const WeightType* weight_ptr = get_ptr<const WeightType>(weight);
     const T* scales_ptr = get_ptr<const T>(scales);
-    fastertransformer::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
+    turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
     const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
     auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
@@ -152,7 +152,7 @@ Tensor fused_gemm_dq(Tensor input_activations, Tensor weight, Tensor scales)
 Tensor
 bench_cublas(Tensor input_activations, Tensor weight_dequantized, const int64_t timing_iterations, float& avg_time)
 {
-    using namespace fastertransformer;
+    using namespace turbomind;
     const int m = input_activations.size(0);
     const int n = weight_dequantized.size(1);
     const int k = input_activations.size(1);
@@ -257,7 +257,7 @@ Tensor fused_gemm_dq_bias_act_helper(
     const T* scales_ptr = get_ptr<const T>(scales);
     const T* bias_ptr = get_ptr<const T>(bias);
-    fastertransformer::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
+    turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
     const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
     auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
...
@@ -24,17 +24,17 @@
 #include <torch/custom_class.h>
 #include <torch/script.h>
-#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
-#include "src/fastertransformer/th_op/th_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
+#include "src/turbomind/th_op/th_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/logger.h"
 #include "cutlass/numeric_types.h"
 
 using torch::Tensor;
 using torch_ext::get_ptr;
 
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 
 template<typename T>
 void int8_gemm_test(
@@ -143,9 +143,9 @@ void int8_gemm_test(
     auto duration = duration_cast<microseconds>(end - start);
     if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
-        FT_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
+        TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
     } else {
-        FT_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
+        TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
         // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
     }
 }
@@ -153,7 +153,7 @@ void int8_gemm_test(
 int main(int argc, char **argv)
 {
     if (argc != 7) {
-        FT_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
+        TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
         return 0;
     }
...
@@ -8,12 +8,12 @@
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/logger.h"
 
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 
 namespace {
@@ -48,11 +48,11 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
         bool ok = almostEqual(a, b, atol, rtol);
         // Print the error.
         if (!ok && failures < 4) {
-            FT_LOG_ERROR(">> invalid result for i=%lu:", i);
-            FT_LOG_ERROR(">> found......: %10.6f", a);
-            FT_LOG_ERROR(">> expected...: %10.6f", b);
-            FT_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
-            FT_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
+            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
+            TM_LOG_ERROR(">> found......: %10.6f", a);
+            TM_LOG_ERROR(">> expected...: %10.6f", b);
+            TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
+            TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
         }
         // Update the number of failures.
         failures += ok ? 0 : 1;
@@ -65,7 +65,7 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
     // Allow not matched up to 1% elements.
     size_t tol_failures = (size_t)(0.01 * size);
     if (failures > tol_failures) {
-        FT_LOG_ERROR("%s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
+        TM_LOG_ERROR("%s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
                      name.c_str(), 100. * failures / size, atol, rtol, 100. * relative_gap);
     }
     return failures <= tol_failures;
...
@@ -2,14 +2,14 @@
 #include <string>   // std::string
 #include <vector>   // std::vector
-#include "src/fastertransformer/kernels/activation_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/activation_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/logger.h"
 #include "unittest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 #define PRINT_LIMIT 16
 #define EPSILON (1e-20)
@@ -30,7 +30,7 @@ struct TestCase {
     void print()
     {
-        FT_LOG_INFO(toString());
+        TM_LOG_INFO(toString());
     }
 };
@@ -107,7 +107,7 @@ void testActivationKernel(TestCase tc)
         invokeAddBiasGeluV2(output_baseline, bias, (const int*) nullptr, (const T*) nullptr, m, n, stream);
     }
     float total_time_opt = cuda_timer_opt.stop();
-    FT_LOG_INFO("%s baseline_time: %f us, opt_time: %f us, speedup: %f (ite: %d)",
+    TM_LOG_INFO("%s baseline_time: %f us, opt_time: %f us, speedup: %f (ite: %d)",
                 tc.toString().c_str(),
                 total_time_baseline / ite * 1000.f,
                 total_time_opt / ite * 1000.f,
@@ -148,7 +148,7 @@ int main()
         // testActivationKernel<float>(tc);
         testActivationKernel<half>(tc);
    }
-    FT_LOG_INFO("testActivationKernel done");
+    TM_LOG_INFO("testActivationKernel done");
     return 0;
 }
...
@@ -15,19 +15,19 @@
  */
 #include "tests/unittests/gtest_utils.h"
-#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
-#include "src/fastertransformer/kernels/gpt_kernels.h"
-#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/kernels/gen_relative_pos_bias.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/kernels/unfused_attention_kernels.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
+#include "src/turbomind/utils/Tensor.h"
 #include <curand.h>
 #include <sstream>
 #include <stdexcept>
 #include <vector>
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 namespace {
...
@@ -10,17 +10,17 @@
 #include <thrust/host_vector.h>
 #include <thrust/transform.h>
-#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
-#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/bert_preprocess_kernels.h"
+#include "src/turbomind/kernels/unfused_attention_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
 #include "unittest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 template<typename scalar_t>
 __global__ void pad_query_kernel(
@@ -216,7 +216,7 @@ static const char* usage = "Usage: %s <batch-size> <num-heads> <key-len> <query-
 int main(int argc, const char* argv[])
 {
-    using namespace fastertransformer;
+    using namespace turbomind;
     using scalar_t = half;
     static const cudaDataType_t kCudaDataType = std::is_same<scalar_t, half>::value ? CUDA_R_16F : CUDA_R_32F;
...
@@ -6,15 +6,15 @@
 #include <tuple>
 #include <vector>
-#include "src/fastertransformer/layers/DenseWeight.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/DenseWeight.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 // Can be replaced by the function provided by a test framework
@@ -33,7 +33,7 @@ public:
 #define EXPECT_TRUE(cond) \
     do { if(!(cond)) { \
-        FT_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
+        TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
                      __func__, __FILE__, __LINE__); \
         throw TestFailureError(__func__); \
     } } while(false)
@@ -42,7 +42,7 @@ public:
     do { \
         bool is_ok = checkResult<dtype,ctype>(name, out, ref); \
         if(!is_ok) { \
-            FT_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
+            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
                          __func__, __FILE__, __LINE__); \
             throw TestFailureError(__func__); \
         } \
@@ -81,7 +81,7 @@ public:
     TensorWrapper(TensorWrapper const& other)
         : allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
     {
-        FT_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
+        TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
     }
     ~TensorWrapper()
     {
@@ -220,11 +220,11 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
         bool ok = almostEqual(a, b, atol, rtol);
         // Print the error.
         if( !ok && failures < 4 ) {
-            FT_LOG_ERROR(">> invalid result for i=%lu:", i);
-            FT_LOG_ERROR(">> found......: %10.6f", a);
-            FT_LOG_ERROR(">> expected...: %10.6f", b);
-            FT_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
-            FT_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
+            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
+            TM_LOG_ERROR(">> found......: %10.6f", a);
+            TM_LOG_ERROR(">> expected...: %10.6f", b);
+            TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
+            TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
         }
         // Update the number of failures.
@@ -233,7 +233,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
     // Allow not matched up to 1% elements.
     size_t tol_failures = (size_t)(0.01 * out_size);
-    FT_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
+    TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
                 name.c_str(), failures <= tol_failures ? "OK" : "FAILED",
                 100. * failures / out_size, atol, rtol);
     return failures <= tol_failures;
@@ -306,7 +306,7 @@ static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs
 template<typename T, DataType computeType>
 void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
-    FT_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
                 m, n, k, toString<T, computeType>().c_str());
     cudaStream_t stream;
     check_cuda_error(cudaStreamCreate(&stream));
@@ -324,7 +324,7 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
     for (auto &op_pair : op_pairs) {
         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);
         computeReference<computeType>(op_pair.transa, op_pair.transb,
                                       expected, a_tensor, b_tensor);
@@ -362,7 +362,7 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
     // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                 m, n, k, toString<T, computeType>().c_str());
     Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -444,7 +444,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
     // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                 m, n, k, toString<T, computeType>().c_str());
     Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -514,7 +514,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
     for (auto &op_pair : op_pairs) {
         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);
         size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
         size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
@@ -578,7 +578,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k) {
     // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
                 batch_size, m, n, k, toString<T, computeType>().c_str());
     Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -693,7 +693,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
 // but let us keep these template variables for later use.
 template<typename T, DataType computeType>
 void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
-    FT_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
                 m, n, k, toString<T, computeType>().c_str());
     cudaStream_t stream;
     check_cuda_error(cudaStreamCreate(&stream));
@@ -712,7 +712,7 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
     for (auto &op_pair : op_pairs) {
         // A/B will be switched in SpGemm.
         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);
         b_tensor.setRandomValues();
         pruneMatrixB(b_tensor.data, stream,
@@ -763,7 +763,7 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
 template<typename T, DataType computeType>
 void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
     // Test if Gemm is consistent with cublasWrapper
-    FT_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
+    TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                 m, n, k, toString<T, computeType>().c_str());
     Allocator<AllocatorType::CUDA> allocator(getDevice());
@@ -799,7 +799,7 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
     for (auto &op_pair : op_pairs) {
         std::string tc_name = getTestName(__func__, op_pair, m, n, k);
-        FT_LOG_DEBUG(tc_name);
+        TM_LOG_DEBUG(tc_name);
         b_tensor.setRandomValues();
         pruneMatrixB(b_tensor.data, stream,
@@ -904,6 +904,6 @@ int main(int argc, char* argv[]) {
         testSpGemmConsistencyMatmul<half, TYPE_FP16>(m, n, k);
     }
 #endif
-    FT_LOG_INFO("Test done");
+    TM_LOG_INFO("Test done");
     return 0;
 }
...
 #include <vector>
 #include <random>
-#include "src/fastertransformer/kernels/gpt_kernels.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/utils/memory_utils.h"
 #include "unittest_utils.h"
...
@@ -5,10 +5,10 @@
 #include <string>
 #include <vector>
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/kernels/transpose_int8_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/kernels/transpose_int8_kernels.h"
 #include <algorithm>
 #include <iostream>
@@ -16,7 +16,7 @@
 #include "tests/unittests/gtest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 class Int8TestSuite: public FtTestBase {
...
@@ -6,15 +6,15 @@
 #include <vector>
 #include <sys/time.h>
-#include "src/fastertransformer/kernels/logprob_kernels.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/logprob_kernels.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
 #include "tests/unittests/gtest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 ////////////////////////////////////////////////////////////////////////////////////
@@ -182,7 +182,7 @@ public:
         bool passed = checkResult(param.toString(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
         EXPECT_TRUE(passed);
 
-        FT_LOG_DEBUG("free host buffers");
+        TM_LOG_DEBUG("free host buffers");
         delete[] expected_cum_log_probs;
         delete[] h_input_lengths;
         delete[] h_input_ids;
...
@@ -27,16 +27,16 @@
 #include <cublasLt.h>
 #include <cuda_runtime.h>
-#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/kernels/sampling_penalty_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
 // #include "tests/unittests/unittest_utils.h"
 #include "tests/unittests/gtest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 struct TemperatureTestParam {
     size_t batch_size;
...
@@ -9,17 +9,17 @@
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/layers/DynamicDecodeLayer.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
 #include "tests/unittests/unittest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 struct TestCase {
     std::string name;
@@ -48,7 +48,7 @@ struct TestCase {
     void print()
     {
-        FT_LOG_INFO(toString());
+        TM_LOG_INFO(toString());
    }
 };
@@ -157,11 +157,11 @@ void testCumLogProbComputation(TestCase tc)
     memset(expected_cum_log_probs, 0, sizeof(float) * batch_size * beam_width);
 #ifndef NDEBUG
-    FT_LOG_DEBUG("logit values");
+    TM_LOG_DEBUG("logit values");
     printMatrixWithLimit(h_logits, batch_size * beam_width, vocab_size, vocab_size, false);
-    FT_LOG_DEBUG("\nprob values");
+    TM_LOG_DEBUG("\nprob values");
     printMatrixWithLimit(h_probs, batch_size * beam_width, vocab_size, vocab_size, false);
-    FT_LOG_DEBUG("\nlog-prob values");
+    TM_LOG_DEBUG("\nlog-prob values");
     printMatrixWithLimit(h_log_probs, batch_size * beam_width, vocab_size, vocab_size, false);
 #endif
@@ -224,7 +224,7 @@ void testCumLogProbComputation(TestCase tc)
         dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
 
-        FT_LOG_DEBUG("Step %2d generated ids", step);
+        TM_LOG_DEBUG("Step %2d generated ids", step);
         cudaD2Hcpy(
             h_output_ids,
             (int*)dynamic_decode_output_tensors.at("output_ids").getPtrWithOffset(step * (batch_size * beam_width)),
@@ -234,7 +234,7 @@ void testCumLogProbComputation(TestCase tc)
         for (size_t i = 0; i < batch_size * beam_width; ++i) {
             int idx = i * vocab_size + h_output_ids[i];
             expected_cum_log_probs[i] += (float)h_log_probs[idx];
-            FT_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
+            TM_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
                          "| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
                          (int)step,
                          (int)i,
@@ -246,7 +246,7 @@ void testCumLogProbComputation(TestCase tc)
                          expected_cum_log_probs[i],
                         (float)h_probs[idx]);
        }
-        FT_LOG_DEBUG("");
+        TM_LOG_DEBUG("");
 #ifndef NDEBUG
         // print output ids
@@ -285,10 +285,10 @@ void testCumLogProbComputation(TestCase tc)
 void printTensors(TensorMap* map, size_t limit = 8)
 {
-    FT_LOG_INFO("Tensors:");
+    TM_LOG_INFO("Tensors:");
     for (auto& kv : *map) {
         Tensor t = kv.second;
-        FT_LOG_INFO(" - %-18s : %s", kv.first.c_str(), t.toString().c_str());
+        TM_LOG_INFO(" - %-18s : %s", kv.first.c_str(), t.toString().c_str());
     }
 }
@@ -468,13 +468,13 @@ private:
                     for (auto& expt : expts) {
                         ss << " " << expt;
                     }
-                    FT_LOG_DEBUG("%s", ss.str().c_str());
+                    TM_LOG_DEBUG("%s", ss.str().c_str());
                 }
                 ++failures;
            }
        }
        delete[] h_output_ids;
-        FT_LOG_DEBUG("check...%6s : %s (failures: %d / %d)",
+        TM_LOG_DEBUG("check...%6s : %s (failures: %d / %d)",
                      failures == 0 ? "....OK" : "FAILED",
                      name.c_str(),
                      failures,
@@ -491,7 +491,7 @@ private:
                  float* temperature,
                  float* repetition_penalty)
     {
-        FT_LOG_INFO("Test %s", name.c_str());
+        TM_LOG_INFO("Test %s", name.c_str());
         std::string tag = fmtstr("Test %s T=%s", name.c_str(), std::is_same<T, float>::value ? "fp32" : "fp16");
         bool passed = true;
         for (unsigned long long seed = 0; seed < max_seed; ++seed) {
@@ -518,7 +518,7 @@ private:
             passed &= is_ok;
 #ifndef NDEBUG
             if (!is_ok) {
-                FT_LOG_ERROR("actual output ids");
+                TM_LOG_ERROR("actual output ids");
                 printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
             }
 #endif
@@ -526,7 +526,7 @@ private:
             delete input_tensors;
             this->teardown();
         }
-        FT_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
+        TM_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
         return passed;
     }
@@ -539,7 +539,7 @@ private:
                  float* temperature,
                  float* repetition_penalty)
     {
-        FT_LOG_INFO("Test %s", name.c_str());
+        TM_LOG_INFO("Test %s", name.c_str());
         std::string tag = fmtstr("Test %s T=%s", name.c_str(), std::is_same<T, float>::value ? "fp32" : "fp16");
         bool passed = true;
         size_t local_batch_size = 2;
@@ -567,7 +567,7 @@ private:
             passed &= is_ok;
 #ifndef NDEBUG
             if (!is_ok) {
-                FT_LOG_ERROR("actual output ids");
+                TM_LOG_ERROR("actual output ids");
                 printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
             }
 #endif
@@ -575,7 +575,7 @@ private:
             delete input_tensors;
             this->teardown();
         }
-        FT_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
+        TM_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
         return passed;
     }
@@ -1229,7 +1229,7 @@ static inline bool isEqualInPeriod(T* vals, size_t size, size_t period_size)
     for (size_t i = 0; i + period_size - 1 < size; i += period_size) {
         for (size_t j = 1; j < period_size; ++j) {
             if (vals[i] != vals[i + j]) {
-                FT_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
+                TM_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
                 return false;
             }
         }
@@ -1244,7 +1244,7 @@ static inline bool isEqualInPeriod(T* vals, size_t size, size_t period_size, siz
     for (size_t i = 0; i + period_size - 1 < size; i += period_size) {
         for (size_t j = 1; j < period_size; ++j) {
             if (j != except && vals[i] != vals[i + j]) {
-                FT_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
+                TM_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
                 return false;
             }
         }
@@ -1284,7 +1284,7 @@ void testCuandBatchInitialize(const size_t batch_size)
     // The same seed produces the same random number.
     bool passed = isEqualInPeriod(h_rand_vals, batch_size, period_size);
-    FT_LOG_INFO("CuandBatchInitTest check....... : %s", passed ? "OK" : "FAILED");
+    TM_LOG_INFO("CuandBatchInitTest check....... : %s", passed ? "OK" : "FAILED");
     EXPECT_TRUE(passed);
     delete h_rand_vals;
@@ -1299,7 +1299,7 @@ void testCuandBatchInitialize(const size_t batch_size)
 template<typename T, bool SINGLE_RANDOM_SEED, bool HAS_DIFF_ARGS, bool USE_LOCAL_BATCH>
 void testSamplingLayerCurandInit(TestCase tc)
 {
-    FT_LOG_DEBUG("testSamplingLayerCurandInit %s", tc.toString().c_str());
+    TM_LOG_DEBUG("testSamplingLayerCurandInit %s", tc.toString().c_str());
     const DataType data_type = getTensorType<T>();
     const size_t beam_width = 1;
@@ -1376,7 +1376,7 @@ void testSamplingLayerCurandInit(TestCase tc)
     deviceFill(d_end_id_buf, batch_size, end_id);
 #ifndef NDEBUG
-    FT_LOG_DEBUG("Random Seeds");
+    TM_LOG_DEBUG("Random Seeds");
     printMatrixWithLimit(random_seed, 1, random_seed_size, random_seed_size, false);
 #endif
@@ -1400,7 +1400,7 @@ void testSamplingLayerCurandInit(TestCase tc)
     cudaH2Dcpy(d_logits_buf, h_logits, batchxbeam * vocab_size);
 #ifndef NDEBUG
-    FT_LOG_DEBUG("logit values");
+    TM_LOG_DEBUG("logit values");
     printMatrixWithLimit(h_logits, batchxbeam, vocab_size, vocab_size, false);
 #endif
     for (uint ite = 0; ite < iteration_num; ++ite) {
@@ -1434,9 +1434,9 @@ void testSamplingLayerCurandInit(TestCase tc)
             dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
             sync_check_cuda_error();
 #ifndef NDEBUG
-            FT_LOG_DEBUG("Step %2d generated ids", step);
+            TM_LOG_DEBUG("Step %2d generated ids", step);
             printMatrix(d_output_ids, max_seq_len, batchxbeam, batchxbeam, true);
-            FT_LOG_DEBUG("");
+            TM_LOG_DEBUG("");
 #endif
             // check results.
             cudaD2Hcpy(h_output_ids,
@@ -1452,7 +1452,7 @@ void testSamplingLayerCurandInit(TestCase tc)
                              HAS_DIFF_ARGS ? "true" : "false",
                              USE_LOCAL_BATCH ? "true" : "false",
                              (std::is_same<T, float>::value ? " fp32" : " fp16"));
-    FT_LOG_INFO("check...%s SamplingLayerCurandInitTest %-30s", passed ? "....OK" : "FAILED", tag.c_str());
+    TM_LOG_INFO("check...%s SamplingLayerCurandInitTest %-30s", passed ? "....OK" : "FAILED", tag.c_str());
     EXPECT_TRUE(passed);
     free(h_logits);
@@ -1495,13 +1495,13 @@ int main()
         testCumLogProbComputation<float>(tc);
         testCumLogProbComputation<half>(tc);
     }
-    FT_LOG_INFO("testCumLogProbComputation done");
+    TM_LOG_INFO("testCumLogProbComputation done");
 
     SamplingDecodeTest<float> sampling_decode_test;
     sampling_decode_test.testAll();
 
     testCuandBatchInitialize(127);
-    FT_LOG_INFO("testCuandBatchInitialize done");
+    TM_LOG_INFO("testCuandBatchInitialize done");
 
 #define LAUNCH_VARIANTS(T, tc, local_batch) \
     testSamplingLayerCurandInit<T, true, false, local_batch>(tc); \
@@ -1515,7 +1515,7 @@ int main()
         LAUNCH_VARIANTS(half, tc, true);
     }
 #undef LAUNCH_VARIANTS
-    FT_LOG_INFO("testSamplingLayerCurandInit done");
+    TM_LOG_INFO("testSamplingLayerCurandInit done");
     return 0;
 }
...
@@ -10,18 +10,18 @@
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/layers/DynamicDecodeLayer.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
 #include "tests/unittests/gtest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 namespace {
...
@@ -9,18 +9,18 @@
 #include <cublasLt.h>
 #include <cuda_runtime.h>
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/layers/DynamicDecodeLayer.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
 // #include "tests/unittests/unittest_utils.h"
 #include "tests/unittests/gtest_utils.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 struct SamplingLayerTestParam {
     size_t batch_size;
@@ -256,12 +256,12 @@ protected:
                     for (auto& expt : expts) {
                         ss << " " << expt;
                     }
-                    FT_LOG_DEBUG("%s", ss.str().c_str());
+                    TM_LOG_DEBUG("%s", ss.str().c_str());
                 }
                 ++failures;
            }
        }
-        FT_LOG_DEBUG("check...%6s : failures: %d / %d",
+        TM_LOG_DEBUG("check...%6s : failures: %d / %d",
                      failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
        delete[] h_output_ids;
        return failures == 0;
@@ -302,7 +302,7 @@ public:
         EXPECT_TRUE(passed) << "Failed at seed " << seed;
 #ifndef NDEBUG
         if (!passed) {
-            FT_LOG_ERROR("actual output ids");
+            TM_LOG_ERROR("actual output ids");
             printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
         }
 #endif
@@ -867,7 +867,7 @@ protected:
         dynamic_decode_layer->forward(&dynamic_decode_output_tensors,
                                       &dynamic_decode_input_tensors);
 
-        FT_LOG_DEBUG("Step %2d generated ids", step);
+        TM_LOG_DEBUG("Step %2d generated ids", step);
         cudaD2Hcpy(h_output_ids,
                    dynamic_decode_output_tensors
                        .at("output_ids")
@@ -878,14 +878,14 @@ protected:
         for (size_t i = 0; i < batch_size * beam_width; ++i) {
             int idx = i * vocab_size + h_output_ids[i];
             expected_cum_log_probs[i] += (float)h_log_probs[idx];
-            FT_LOG_DEBUG(
+            TM_LOG_DEBUG(
                 "| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
                 "| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
                 (int)step, (int)i, (int)idx, (int)h_output_ids[i],
                 h_output_log_probs[step * batch_size * beam_width + i], (float)h_log_probs[idx],
                 h_cum_log_probs[i], expected_cum_log_probs[i], (float)h_probs[idx]);
         }
-        FT_LOG_DEBUG("");
+        TM_LOG_DEBUG("");
     }
     bool passed = checkResult(param.toString(), cum_log_probs, expected_cum_log_probs, batch_size * beam_width);
...
@@ -4,9 +4,9 @@
 #include <gtest/gtest.h>
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 namespace {
...
@@ -26,15 +26,15 @@
 #include <string>  // string
 #include <vector>  // vector
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+#include "src/turbomind/utils/string_utils.h"
 
 #define PRINT_LIMIT 16
 #define EPSILON (1e-20)
 #define EPSILON_FP16 (1e-10)
 
-using namespace fastertransformer;
+using namespace turbomind;
 
 class TestFailureError : public std::exception {
 private:
@@ -51,14 +51,14 @@ public:
 #define EXPECT_TRUE(cond) \
     do { if(!(cond)) { \
-        FT_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
+        TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
                      __func__, #cond, __FILE__, __LINE__); \
         throw TestFailureError(__func__); \
     } } while(false)
 
 #define EXPECT_FALSE(cond) \
     do { if(cond) { \
-        FT_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
+        TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
                      __func__, #cond, __FILE__, __LINE__); \
         throw TestFailureError(__func__); \
     } } while(false)
@@ -92,11 +92,11 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
         bool ok = almostEqual(a, b, atol, rtol);
         // Print the error.
         if (!ok && failures < 4) {
-            FT_LOG_ERROR(">> invalid result for i=%lu:", i);
-            FT_LOG_ERROR(">> found......: %10.6f", a);
-            FT_LOG_ERROR(">> expected...: %10.6f", b);
-            FT_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
-            FT_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
+            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
+            TM_LOG_ERROR(">> found......: %10.6f", a);
+            TM_LOG_ERROR(">> expected...: %10.6f", b);
+            TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
+            TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
         }
         // Update the number of failures.
         failures += ok ? 0 : 1;
@@ -108,7 +108,7 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
     // Allow not matched up to 1% elements.
     size_t tol_failures = (size_t)(0.01 * size);
-    FT_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
+    TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
                 failures <= tol_failures ? "....OK" : "FAILED", name.c_str(),
                 100. * failures / size, atol, rtol, 100. * relative_gap);
     return failures <= tol_failures;
...
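
Note: the TM_LOG_* macros themselves live in src/turbomind/utils/logger.h, which is outside the hunks shown here. As a reading aid only, a hypothetical stand-in consistent with the call sites above (they pass either a std::string or a printf-style format plus arguments) could look like this; it is a sketch, not the library's actual definition:

    // Hypothetical stand-in for illustration; NOT the real src/turbomind/utils/logger.h.
    #include <cstdio>
    #include <string>

    // Overload for call sites that pass a std::string, e.g. TM_LOG_INFO(toString()).
    inline void tm_log(const char* level, const std::string& msg)
    {
        std::fprintf(stderr, "[TM][%s] %s\n", level, msg.c_str());
    }

    // Overload for printf-style call sites, e.g. TM_LOG_ERROR(">> found......: %10.6f", a).
    template<typename... Args>
    inline void tm_log(const char* level, const char* fmt, Args... args)
    {
        std::fprintf(stderr, "[TM][%s] ", level);
        std::fprintf(stderr, fmt, args...);
        std::fputc('\n', stderr);
    }

    #define TM_LOG_DEBUG(...) tm_log("DEBUG", __VA_ARGS__)
    #define TM_LOG_INFO(...)  tm_log("INFO",  __VA_ARGS__)
    #define TM_LOG_ERROR(...) tm_log("ERROR", __VA_ARGS__)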