Unverified commit 35d64462, authored by lvhan028, committed by GitHub
Browse files

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
......@@ -22,9 +22,9 @@
#include <torch/custom_class.h>
#include <torch/script.h>
#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/fastertransformer/th_op/th_utils.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/turbomind/th_op/th_utils.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "cutlass/numeric_types.h"
......@@ -32,7 +32,7 @@ using torch::Tensor;
namespace torch_ext {
namespace ft = fastertransformer;
namespace ft = turbomind;
template<typename T, typename WeightType>
Tensor fused_gemm_dq_helper(
......@@ -48,7 +48,7 @@ Tensor fused_gemm_dq_helper(
const WeightType* weight_ptr = get_ptr<const WeightType>(weight);
const T* scales_ptr = get_ptr<const T>(scales);
fastertransformer::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
......@@ -152,7 +152,7 @@ Tensor fused_gemm_dq(Tensor input_activations, Tensor weight, Tensor scales)
Tensor
bench_cublas(Tensor input_activations, Tensor weight_dequantized, const int64_t timing_iterations, float& avg_time)
{
using namespace fastertransformer;
using namespace turbomind;
const int m = input_activations.size(0);
const int n = weight_dequantized.size(1);
const int k = input_activations.size(1);
......@@ -257,7 +257,7 @@ Tensor fused_gemm_dq_bias_act_helper(
const T* scales_ptr = get_ptr<const T>(scales);
const T* bias_ptr = get_ptr<const T>(bias);
fastertransformer::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
......
......@@ -24,17 +24,17 @@
#include <torch/custom_class.h>
#include <torch/script.h>
#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/fastertransformer/th_op/th_utils.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/turbomind/th_op/th_utils.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/logger.h"
#include "cutlass/numeric_types.h"
using torch::Tensor;
using torch_ext::get_ptr;
namespace ft = fastertransformer;
namespace ft = turbomind;
template<typename T>
void int8_gemm_test(
......@@ -143,9 +143,9 @@ void int8_gemm_test(
auto duration = duration_cast<microseconds>(end - start);
if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
FT_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
} else {
FT_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
}
}
......@@ -153,7 +153,7 @@ void int8_gemm_test(
int main(int argc, char **argv)
{
if (argc != 7) {
FT_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
return 0;
}
......
......@@ -8,12 +8,12 @@
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/logger.h"
namespace ft = fastertransformer;
namespace ft = turbomind;
namespace {
......@@ -48,11 +48,11 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
bool ok = almostEqual(a, b, atol, rtol);
// Print the error.
if (!ok && failures < 4) {
FT_LOG_ERROR(">> invalid result for i=%lu:", i);
FT_LOG_ERROR(">> found......: %10.6f", a);
FT_LOG_ERROR(">> expected...: %10.6f", b);
FT_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
FT_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
TM_LOG_ERROR(">> invalid result for i=%lu:", i);
TM_LOG_ERROR(">> found......: %10.6f", a);
TM_LOG_ERROR(">> expected...: %10.6f", b);
TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
}
// Update the number of failures.
failures += ok ? 0 : 1;
......@@ -65,7 +65,7 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
// Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * size);
if (failures > tol_failures) {
FT_LOG_ERROR("%s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
TM_LOG_ERROR("%s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
name.c_str(), 100. * failures / size, atol, rtol, 100. * relative_gap);
}
return failures <= tol_failures;
......
......@@ -2,14 +2,14 @@
#include <string> // std::string
#include <vector> // std::vector
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/logger.h"
#include "unittest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
#define PRINT_LIMIT 16
#define EPSILON (1e-20)
......@@ -30,7 +30,7 @@ struct TestCase {
void print()
{
FT_LOG_INFO(toString());
TM_LOG_INFO(toString());
}
};
......@@ -107,7 +107,7 @@ void testActivationKernel(TestCase tc)
invokeAddBiasGeluV2(output_baseline, bias, (const int*) nullptr, (const T*) nullptr, m, n, stream);
}
float total_time_opt = cuda_timer_opt.stop();
FT_LOG_INFO("%s baseline_time: %f us, opt_time: %f us, speedup: %f (ite: %d)",
TM_LOG_INFO("%s baseline_time: %f us, opt_time: %f us, speedup: %f (ite: %d)",
tc.toString().c_str(),
total_time_baseline / ite * 1000.f,
total_time_opt / ite * 1000.f,
......@@ -148,7 +148,7 @@ int main()
// testActivationKernel<float>(tc);
testActivationKernel<half>(tc);
}
FT_LOG_INFO("testActivationKernel done");
TM_LOG_INFO("testActivationKernel done");
return 0;
}
......@@ -15,19 +15,19 @@
*/
#include "tests/unittests/gtest_utils.h"
#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/kernels/gen_relative_pos_bias.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include <curand.h>
#include <sstream>
#include <stdexcept>
#include <vector>
using namespace fastertransformer;
using namespace turbomind;
namespace {
......
......@@ -10,17 +10,17 @@
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
#include "unittest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
template<typename scalar_t>
__global__ void pad_query_kernel(
......@@ -216,7 +216,7 @@ static const char* usage = "Usage: %s <batch-size> <num-heads> <key-len> <query-
int main(int argc, const char* argv[])
{
using namespace fastertransformer;
using namespace turbomind;
using scalar_t = half;
static const cudaDataType_t kCudaDataType = std::is_same<scalar_t, half>::value ? CUDA_R_16F : CUDA_R_32F;
......
......@@ -6,15 +6,15 @@
#include <tuple>
#include <vector>
#include "src/fastertransformer/layers/DenseWeight.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/layers/DenseWeight.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/gemm.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
using namespace fastertransformer;
using namespace turbomind;
// Can be replaced by the function provided by a test framework
......@@ -33,7 +33,7 @@ public:
#define EXPECT_TRUE(cond) \
do { if(!(cond)) { \
FT_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
__func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
......@@ -42,7 +42,7 @@ public:
do { \
bool is_ok = checkResult<dtype,ctype>(name, out, ref); \
if(!is_ok) { \
FT_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
__func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
......@@ -81,7 +81,7 @@ public:
TensorWrapper(TensorWrapper const& other)
: allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
{
FT_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
}
~TensorWrapper()
{
......@@ -220,11 +220,11 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
bool ok = almostEqual(a, b, atol, rtol);
// Print the error.
if( !ok && failures < 4 ) {
FT_LOG_ERROR(">> invalid result for i=%lu:", i);
FT_LOG_ERROR(">> found......: %10.6f", a);
FT_LOG_ERROR(">> expected...: %10.6f", b);
FT_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
FT_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
TM_LOG_ERROR(">> invalid result for i=%lu:", i);
TM_LOG_ERROR(">> found......: %10.6f", a);
TM_LOG_ERROR(">> expected...: %10.6f", b);
TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
}
// Update the number of failures.
......@@ -233,7 +233,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
// Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * out_size);
FT_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
name.c_str(), failures <= tol_failures ? "OK" : "FAILED",
100. * failures / out_size, atol, rtol);
return failures <= tol_failures;
......@@ -306,7 +306,7 @@ static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs
template<typename T, DataType computeType>
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
FT_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
......@@ -324,7 +324,7 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
for (auto &op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
FT_LOG_DEBUG(tc_name);
TM_LOG_DEBUG(tc_name);
computeReference<computeType>(op_pair.transa, op_pair.transb,
expected, a_tensor, b_tensor);
......@@ -362,7 +362,7 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
template<typename T, DataType computeType>
void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
// Test if Gemm is consistent with cublasWrapper
FT_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
......@@ -444,7 +444,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
template<typename T, DataType computeType>
void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
// Test if Gemm is consistent with cublasWrapper
FT_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
......@@ -514,7 +514,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
for (auto &op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
FT_LOG_DEBUG(tc_name);
TM_LOG_DEBUG(tc_name);
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
......@@ -578,7 +578,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
template<typename T, DataType computeType>
void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k) {
// Test if Gemm is consistent with cublasWrapper
FT_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
batch_size, m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
......@@ -693,7 +693,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
// but let us keep these template variables for later use.
template<typename T, DataType computeType>
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
FT_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
TM_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
......@@ -712,7 +712,7 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
for (auto &op_pair : op_pairs) {
// A/B will be switched in SpGemm.
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
FT_LOG_DEBUG(tc_name);
TM_LOG_DEBUG(tc_name);
b_tensor.setRandomValues();
pruneMatrixB(b_tensor.data, stream,
......@@ -763,7 +763,7 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
template<typename T, DataType computeType>
void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
// Test if Gemm is consistent with cublasWrapper
FT_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
......@@ -799,7 +799,7 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
for (auto &op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
FT_LOG_DEBUG(tc_name);
TM_LOG_DEBUG(tc_name);
b_tensor.setRandomValues();
pruneMatrixB(b_tensor.data, stream,
......@@ -904,6 +904,6 @@ int main(int argc, char* argv[]) {
testSpGemmConsistencyMatmul<half, TYPE_FP16>(m, n, k);
}
#endif
FT_LOG_INFO("Test done");
TM_LOG_INFO("Test done");
return 0;
}
#include <vector>
#include <random>
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/utils/memory_utils.h"
#include "unittest_utils.h"
......
......@@ -5,10 +5,10 @@
#include <string>
#include <vector>
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/kernels/transpose_int8_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include <algorithm>
#include <iostream>
......@@ -16,7 +16,7 @@
#include "tests/unittests/gtest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
class Int8TestSuite: public FtTestBase {
......
......@@ -6,15 +6,15 @@
#include <vector>
#include <sys/time.h>
#include "src/fastertransformer/kernels/logprob_kernels.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/logprob_kernels.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
#include "tests/unittests/gtest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
////////////////////////////////////////////////////////////////////////////////////
......@@ -182,7 +182,7 @@ public:
bool passed = checkResult(param.toString(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
EXPECT_TRUE(passed);
FT_LOG_DEBUG("free host buffers");
TM_LOG_DEBUG("free host buffers");
delete[] expected_cum_log_probs;
delete[] h_input_lengths;
delete[] h_input_ids;
......
......@@ -27,16 +27,16 @@
#include <cublasLt.h>
#include <cuda_runtime.h>
#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
// #include "tests/unittests/unittest_utils.h"
#include "tests/unittests/gtest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
struct TemperatureTestParam {
size_t batch_size;
......
......@@ -9,17 +9,17 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "tests/unittests/unittest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
struct TestCase {
std::string name;
......@@ -48,7 +48,7 @@ struct TestCase {
void print()
{
FT_LOG_INFO(toString());
TM_LOG_INFO(toString());
}
};
......@@ -157,11 +157,11 @@ void testCumLogProbComputation(TestCase tc)
memset(expected_cum_log_probs, 0, sizeof(float) * batch_size * beam_width);
#ifndef NDEBUG
FT_LOG_DEBUG("logit values");
TM_LOG_DEBUG("logit values");
printMatrixWithLimit(h_logits, batch_size * beam_width, vocab_size, vocab_size, false);
FT_LOG_DEBUG("\nprob values");
TM_LOG_DEBUG("\nprob values");
printMatrixWithLimit(h_probs, batch_size * beam_width, vocab_size, vocab_size, false);
FT_LOG_DEBUG("\nlog-prob values");
TM_LOG_DEBUG("\nlog-prob values");
printMatrixWithLimit(h_log_probs, batch_size * beam_width, vocab_size, vocab_size, false);
#endif
......@@ -224,7 +224,7 @@ void testCumLogProbComputation(TestCase tc)
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
FT_LOG_DEBUG("Step %2d generated ids", step);
TM_LOG_DEBUG("Step %2d generated ids", step);
cudaD2Hcpy(
h_output_ids,
(int*)dynamic_decode_output_tensors.at("output_ids").getPtrWithOffset(step * (batch_size * beam_width)),
......@@ -234,7 +234,7 @@ void testCumLogProbComputation(TestCase tc)
for (size_t i = 0; i < batch_size * beam_width; ++i) {
int idx = i * vocab_size + h_output_ids[i];
expected_cum_log_probs[i] += (float)h_log_probs[idx];
FT_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
TM_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
"| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
(int)step,
(int)i,
......@@ -246,7 +246,7 @@ void testCumLogProbComputation(TestCase tc)
expected_cum_log_probs[i],
(float)h_probs[idx]);
}
FT_LOG_DEBUG("");
TM_LOG_DEBUG("");
#ifndef NDEBUG
// print output ids
......@@ -285,10 +285,10 @@ void testCumLogProbComputation(TestCase tc)
void printTensors(TensorMap* map, size_t limit = 8)
{
FT_LOG_INFO("Tensors:");
TM_LOG_INFO("Tensors:");
for (auto& kv : *map) {
Tensor t = kv.second;
FT_LOG_INFO(" - %-18s : %s", kv.first.c_str(), t.toString().c_str());
TM_LOG_INFO(" - %-18s : %s", kv.first.c_str(), t.toString().c_str());
}
}
......@@ -468,13 +468,13 @@ private:
for (auto& expt : expts) {
ss << " " << expt;
}
FT_LOG_DEBUG("%s", ss.str().c_str());
TM_LOG_DEBUG("%s", ss.str().c_str());
}
++failures;
}
}
delete[] h_output_ids;
FT_LOG_DEBUG("check...%6s : %s (failures: %d / %d)",
TM_LOG_DEBUG("check...%6s : %s (failures: %d / %d)",
failures == 0 ? "....OK" : "FAILED",
name.c_str(),
failures,
......@@ -491,7 +491,7 @@ private:
float* temperature,
float* repetition_penalty)
{
FT_LOG_INFO("Test %s", name.c_str());
TM_LOG_INFO("Test %s", name.c_str());
std::string tag = fmtstr("Test %s T=%s", name.c_str(), std::is_same<T, float>::value ? "fp32" : "fp16");
bool passed = true;
for (unsigned long long seed = 0; seed < max_seed; ++seed) {
......@@ -518,7 +518,7 @@ private:
passed &= is_ok;
#ifndef NDEBUG
if (!is_ok) {
FT_LOG_ERROR("actual output ids");
TM_LOG_ERROR("actual output ids");
printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
}
#endif
......@@ -526,7 +526,7 @@ private:
delete input_tensors;
this->teardown();
}
FT_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
TM_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
return passed;
}
......@@ -539,7 +539,7 @@ private:
float* temperature,
float* repetition_penalty)
{
FT_LOG_INFO("Test %s", name.c_str());
TM_LOG_INFO("Test %s", name.c_str());
std::string tag = fmtstr("Test %s T=%s", name.c_str(), std::is_same<T, float>::value ? "fp32" : "fp16");
bool passed = true;
size_t local_batch_size = 2;
......@@ -567,7 +567,7 @@ private:
passed &= is_ok;
#ifndef NDEBUG
if (!is_ok) {
FT_LOG_ERROR("actual output ids");
TM_LOG_ERROR("actual output ids");
printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
}
#endif
......@@ -575,7 +575,7 @@ private:
delete input_tensors;
this->teardown();
}
FT_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
TM_LOG_INFO("check...%6s : %s", passed ? "....OK" : "FAILED", tag.c_str());
return passed;
}
......@@ -1229,7 +1229,7 @@ static inline bool isEqualInPeriod(T* vals, size_t size, size_t period_size)
for (size_t i = 0; i + period_size - 1 < size; i += period_size) {
for (size_t j = 1; j < period_size; ++j) {
if (vals[i] != vals[i + j]) {
FT_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
TM_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
return false;
}
}
......@@ -1244,7 +1244,7 @@ static inline bool isEqualInPeriod(T* vals, size_t size, size_t period_size, siz
for (size_t i = 0; i + period_size - 1 < size; i += period_size) {
for (size_t j = 1; j < period_size; ++j) {
if (j != except && vals[i] != vals[i + j]) {
FT_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
TM_LOG_INFO(" **** *** ** * [%d] %d <> [%d] %d", i, vals[i], i + j, vals[i + j]);
return false;
}
}
......@@ -1284,7 +1284,7 @@ void testCuandBatchInitialize(const size_t batch_size)
// The same seed produces the same random number.
bool passed = isEqualInPeriod(h_rand_vals, batch_size, period_size);
FT_LOG_INFO("CuandBatchInitTest check....... : %s", passed ? "OK" : "FAILED");
TM_LOG_INFO("CuandBatchInitTest check....... : %s", passed ? "OK" : "FAILED");
EXPECT_TRUE(passed);
delete h_rand_vals;
......@@ -1299,7 +1299,7 @@ void testCuandBatchInitialize(const size_t batch_size)
template<typename T, bool SINGLE_RANDOM_SEED, bool HAS_DIFF_ARGS, bool USE_LOCAL_BATCH>
void testSamplingLayerCurandInit(TestCase tc)
{
FT_LOG_DEBUG("testSamplingLayerCurandInit %s", tc.toString().c_str());
TM_LOG_DEBUG("testSamplingLayerCurandInit %s", tc.toString().c_str());
const DataType data_type = getTensorType<T>();
const size_t beam_width = 1;
......@@ -1376,7 +1376,7 @@ void testSamplingLayerCurandInit(TestCase tc)
deviceFill(d_end_id_buf, batch_size, end_id);
#ifndef NDEBUG
FT_LOG_DEBUG("Random Seeds");
TM_LOG_DEBUG("Random Seeds");
printMatrixWithLimit(random_seed, 1, random_seed_size, random_seed_size, false);
#endif
......@@ -1400,7 +1400,7 @@ void testSamplingLayerCurandInit(TestCase tc)
cudaH2Dcpy(d_logits_buf, h_logits, batchxbeam * vocab_size);
#ifndef NDEBUG
FT_LOG_DEBUG("logit values");
TM_LOG_DEBUG("logit values");
printMatrixWithLimit(h_logits, batchxbeam, vocab_size, vocab_size, false);
#endif
for (uint ite = 0; ite < iteration_num; ++ite) {
......@@ -1434,9 +1434,9 @@ void testSamplingLayerCurandInit(TestCase tc)
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
sync_check_cuda_error();
#ifndef NDEBUG
FT_LOG_DEBUG("Step %2d generated ids", step);
TM_LOG_DEBUG("Step %2d generated ids", step);
printMatrix(d_output_ids, max_seq_len, batchxbeam, batchxbeam, true);
FT_LOG_DEBUG("");
TM_LOG_DEBUG("");
#endif
// check results.
cudaD2Hcpy(h_output_ids,
......@@ -1452,7 +1452,7 @@ void testSamplingLayerCurandInit(TestCase tc)
HAS_DIFF_ARGS ? "true" : "false",
USE_LOCAL_BATCH ? "true" : "false",
(std::is_same<T, float>::value ? " fp32" : " fp16"));
FT_LOG_INFO("check...%s SamplingLayerCurandInitTest %-30s", passed ? "....OK" : "FAILED", tag.c_str());
TM_LOG_INFO("check...%s SamplingLayerCurandInitTest %-30s", passed ? "....OK" : "FAILED", tag.c_str());
EXPECT_TRUE(passed);
free(h_logits);
......@@ -1495,13 +1495,13 @@ int main()
testCumLogProbComputation<float>(tc);
testCumLogProbComputation<half>(tc);
}
FT_LOG_INFO("testCumLogProbComputation done");
TM_LOG_INFO("testCumLogProbComputation done");
SamplingDecodeTest<float> sampling_decode_test;
sampling_decode_test.testAll();
testCuandBatchInitialize(127);
FT_LOG_INFO("testCuandBatchInitialize done");
TM_LOG_INFO("testCuandBatchInitialize done");
#define LAUNCH_VARIANTS(T, tc, local_batch) \
testSamplingLayerCurandInit<T, true, false, local_batch>(tc); \
......@@ -1515,7 +1515,7 @@ int main()
LAUNCH_VARIANTS(half, tc, true);
}
#undef LAUNCH_VARIANTS
FT_LOG_INFO("testSamplingLayerCurandInit done");
TM_LOG_INFO("testSamplingLayerCurandInit done");
return 0;
}
......@@ -10,18 +10,18 @@
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "tests/unittests/gtest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
namespace {
......
......@@ -9,18 +9,18 @@
#include <cublasLt.h>
#include <cuda_runtime.h>
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
// #include "tests/unittests/unittest_utils.h"
#include "tests/unittests/gtest_utils.h"
using namespace fastertransformer;
using namespace turbomind;
struct SamplingLayerTestParam {
size_t batch_size;
......@@ -256,12 +256,12 @@ protected:
for (auto& expt : expts) {
ss << " " << expt;
}
FT_LOG_DEBUG("%s", ss.str().c_str());
TM_LOG_DEBUG("%s", ss.str().c_str());
}
++failures;
}
}
FT_LOG_DEBUG("check...%6s : failures: %d / %d",
TM_LOG_DEBUG("check...%6s : failures: %d / %d",
failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
delete[] h_output_ids;
return failures == 0;
......@@ -302,7 +302,7 @@ public:
EXPECT_TRUE(passed) << "Failed at seed " << seed;
#ifndef NDEBUG
if (!passed) {
FT_LOG_ERROR("actual output ids");
TM_LOG_ERROR("actual output ids");
printMatrix(d_output_ids, max_seq_len, batch_size, batch_size, true);
}
#endif
......@@ -867,7 +867,7 @@ protected:
dynamic_decode_layer->forward(&dynamic_decode_output_tensors,
&dynamic_decode_input_tensors);
FT_LOG_DEBUG("Step %2d generated ids", step);
TM_LOG_DEBUG("Step %2d generated ids", step);
cudaD2Hcpy(h_output_ids,
dynamic_decode_output_tensors
.at("output_ids")
......@@ -878,14 +878,14 @@ protected:
for (size_t i = 0; i < batch_size * beam_width; ++i) {
int idx = i * vocab_size + h_output_ids[i];
expected_cum_log_probs[i] += (float)h_log_probs[idx];
FT_LOG_DEBUG(
TM_LOG_DEBUG(
"| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
"| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
(int)step, (int)i, (int)idx, (int)h_output_ids[i],
h_output_log_probs[step * batch_size * beam_width + i], (float)h_log_probs[idx],
h_cum_log_probs[i], expected_cum_log_probs[i], (float)h_probs[idx]);
}
FT_LOG_DEBUG("");
TM_LOG_DEBUG("");
}
bool passed = checkResult(param.toString(), cum_log_probs, expected_cum_log_probs, batch_size * beam_width);
......
......@@ -4,9 +4,9 @@
#include <gtest/gtest.h>
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/utils/Tensor.h"
using namespace fastertransformer;
using namespace turbomind;
namespace {
......
......@@ -26,15 +26,15 @@
#include <string> // string
#include <vector> // vector
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/fastertransformer/utils/string_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/string_utils.h"
#define PRINT_LIMIT 16
#define EPSILON (1e-20)
#define EPSILON_FP16 (1e-10)
using namespace fastertransformer;
using namespace turbomind;
class TestFailureError : public std::exception {
private:
......@@ -51,14 +51,14 @@ public:
#define EXPECT_TRUE(cond) \
do { if(!(cond)) { \
FT_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
#define EXPECT_FALSE(cond) \
do { if(cond) { \
FT_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
......@@ -92,11 +92,11 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
bool ok = almostEqual(a, b, atol, rtol);
// Print the error.
if (!ok && failures < 4) {
FT_LOG_ERROR(">> invalid result for i=%lu:", i);
FT_LOG_ERROR(">> found......: %10.6f", a);
FT_LOG_ERROR(">> expected...: %10.6f", b);
FT_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
FT_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
TM_LOG_ERROR(">> invalid result for i=%lu:", i);
TM_LOG_ERROR(">> found......: %10.6f", a);
TM_LOG_ERROR(">> expected...: %10.6f", b);
TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
}
// Update the number of failures.
failures += ok ? 0 : 1;
......@@ -108,7 +108,7 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
// Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * size);
FT_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
failures <= tol_failures ? "....OK" : "FAILED", name.c_str(),
100. * failures / size, atol, rtol, 100. * relative_gap);
return failures <= tol_failures;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment