Unverified Commit 4c9959f6 authored by Chen Xin, committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
......@@ -24,6 +24,12 @@
namespace turbomind {
// cub.cuh brings windows.h
// should be included after cub.cuh
#ifdef ERROR
#undef ERROR
#endif
class Logger {
public:
......
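Note: this hunk works around the `ERROR` macro that windows.h (pulled in transitively by cub.cuh, via wingdi.h, as `#define ERROR 0`) injects into every translation unit; without the `#undef`, any identifier named `ERROR` fails to compile on Windows. A minimal sketch of the collision, assuming a log-level enum roughly like the one in logger.h (hypothetical member names):

#ifdef ERROR
#undef ERROR  // must come after any header that drags in windows.h
#endif

// Hypothetical Level enum; the real Logger's enumerators may differ.
// With the macro still defined, ERROR below would expand to 0,
// producing a syntax error.
enum Level {
    DEBUG,
    INFO,
    WARNING,
    ERROR  // compiles only once the macro is gone
};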
......@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/logger.h"
......@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
}
// get slices
ConcateSlice slice0{.slices = {{0, dim0}}};
ConcateSlice slice1{.slices = {{0, dim1}}};
ConcateSlice slice0{{{0, dim0}}};
ConcateSlice slice1{{{0, dim1}}};
if (slices.size() > 0 && slices[0].slices.size() > 0) {
slice0 = slices[0];
}
......
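Note: designated initializers (`.slices = ...`) are a C++20 feature that GCC and Clang accept as an extension in earlier language modes but MSVC rejects, so the hunk falls back to positional aggregate initialization. A sketch of the two forms, assuming ConcateSlice is an aggregate along these (hypothetical) lines:

#include <utility>
#include <vector>

// Hypothetical stand-in for the real ConcateSlice; only the shape matters.
struct ConcateSlice {
    std::vector<std::pair<size_t, size_t>> slices;
};

ConcateSlice a{.slices = {{0, 128}}};  // C++20 / GNU extension; MSVC rejects it pre-C++20
ConcateSlice b{{{0, 128}}};            // portable positional form, same value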
......@@ -15,6 +15,7 @@
*/
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/macro.h"
#include <atomic>
namespace turbomind {
......
......@@ -18,7 +18,7 @@
#include "nvtx_utils.h"
#ifdef USE_NVTX
#include "nvToolsExt.h"
#include "nvtx3/nvToolsExt.h"
#endif
namespace ft_nvtx {
......
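Note: recent CUDA toolkits ship NVTX v3 header-only under the `nvtx3/` directory on both Linux and Windows, whereas the bare `nvToolsExt.h` path relied on the older NVTX v2 layout. Call sites are unchanged; a minimal usage sketch:

#include "nvtx3/nvToolsExt.h"  // NVTX v3 header-only layout shipped with CUDA

void profiledStep()
{
    nvtxRangePushA("profiledStep");  // open a named range visible in Nsight
    // ... work to be profiled ...
    nvtxRangePop();                  // close the range
}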
......@@ -14,11 +14,11 @@
* limitations under the License.
*/
#include <chrono>
#include <cstdlib>
#include <cublas_v2.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <chrono>
#include "torch/csrc/cuda/Stream.h"
#include <torch/custom_class.h>
......@@ -37,18 +37,17 @@ using torch_ext::get_ptr;
namespace ft = turbomind;
template<typename T>
void int8_gemm_test(
const int m,
void int8_gemm_test(const int m,
const int n,
const int k,
const at::ScalarType output_data_type,
const QuantMode quant_mode,
const int iters)
{
const bool per_token_quant = quant_mode == QuantMode::PerTokenChannelQuant
|| quant_mode == QuantMode::PerTokenQuant;
const bool per_channel_quant = quant_mode == QuantMode::PerTokenChannelQuant
|| quant_mode == QuantMode::PerChannelQuant;
const bool per_token_quant =
quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant;
const bool per_channel_quant =
quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant;
const int row_scale_size = per_token_quant ? m : 1;
const int col_scale_size = per_channel_quant ? n : 1;
......@@ -82,10 +81,10 @@ void int8_gemm_test(
auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false));
auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
* torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
* torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
auto alpha_row_torch = alpha_row_cultass.expand({m, 1});
auto alpha_col_torch = alpha_col_cutlass.expand({1, n});
......@@ -117,7 +116,8 @@ void int8_gemm_test(
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy("y_gpu_int32.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy(
"y_gpu_int32.npy");
ft::check_cuda_error(cudaStreamSynchronize(stream));
auto start = high_resolution_clock::now();
......@@ -142,26 +142,29 @@ void int8_gemm_test(
auto duration = duration_cast<microseconds>(end - start);
if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
if (torch::allclose(
(y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
} else {
}
else {
TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) *
// alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
}
}
int main(int argc, char **argv)
int main(int argc, char** argv)
{
if (argc != 7) {
TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
TM_LOG_ERROR(
"arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
return 0;
}
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int k = atoi(argv[3]);
const at::ScalarType output_data_type = atoi(argv[4]) == 0 ?
at::ScalarType::Half : at::ScalarType::BFloat16;
const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? at::ScalarType::Half : at::ScalarType::BFloat16;
const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
if (quant_mode == QuantMode::PerChannelQuant) {
printf("per channel quant \n");
......@@ -170,7 +173,8 @@ int main(int argc, char **argv)
if (output_data_type == at::ScalarType::Half) {
int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
} else {
}
else {
#if ENABLE_BF16
int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
#endif
......
......@@ -20,7 +20,12 @@ FetchContent_Declare(
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG release-1.12.1
)
add_definitions(-DTORCH_CUDA=1)
find_package(CUDAToolkit REQUIRED)
if (NOT MSVC)
add_definitions(-DTORCH_CUDA=1)
endif()
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
......@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14)
# Sorted by alphabetical order of test name.
target_link_libraries( # Libs for test_attention_kernels
unittest PUBLIC
-lcudart -lcurand
CUDA::cudart CUDA::curand
gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
target_link_libraries( # Libs for test_logprob_kernels
unittest PUBLIC
-lcudart
CUDA::cudart
logprob_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_penalty_kernels
unittest PUBLIC
-lcublas -lcublasLt -lcudart
CUDA::cublas CUDA::cublasLt CUDA::cudart
sampling_penalty_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_sampling_kernel
unittest PUBLIC
-lcudart
CUDA::cudart
sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger)
target_link_libraries( # Libs for test_sampling_layer
unittest PUBLIC
-lcublas -lcublasLt -lcudart
CUDA::cublas CUDA::cublasLt CUDA::cudart
cublasMMWrapper memory_utils
DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger)
target_link_libraries( # Libs for test_tensor
......@@ -65,7 +70,7 @@ target_link_libraries( # Libs for test_tensor
remove_definitions(-DTORCH_CUDA=1)
add_executable(test_gemm test_gemm.cu)
target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper tensor cuda_utils logger)
target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
add_executable(test_gpt_kernels test_gpt_kernels.cu)
target_link_libraries(test_gpt_kernels PUBLIC
......@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC
add_executable(test_context_attention_layer test_context_attention_layer.cu)
target_link_libraries(test_context_attention_layer PUBLIC
Llama -lcublas -lcublasLt -lcudart
Llama CUDA::cublas CUDA::cublasLt CUDA::cudart
unfused_attention_kernels
memory_utils tensor cublasMMWrapper cuda_utils logger)
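Note: raw `-lcudart`-style flags are GNU linker syntax and break under MSVC, which is why the hunk switches to the `CUDA::cudart` etc. imported targets defined by `find_package(CUDAToolkit)` (available since CMake 3.17, matching the "cmake version" commit above). The imported targets also carry include directories and per-platform library paths. A minimal sketch of the pattern, with a hypothetical target name:

cmake_minimum_required(VERSION 3.17)
project(demo LANGUAGES CXX CUDA)

find_package(CUDAToolkit REQUIRED)  # defines CUDA::cudart, CUDA::cublas, ...

add_executable(demo_app main.cu)    # hypothetical target
# Imported targets resolve to the correct libraries on every platform,
# unlike raw -lcudart flags that only GNU-style linkers understand.
target_link_libraries(demo_app PRIVATE CUDA::cudart CUDA::cublas CUDA::curand)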
......@@ -14,13 +14,12 @@
* limitations under the License.
*/
#include "gtest_utils.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "gtest_utils.h"
#include <curand.h>
#include <sstream>
......
......@@ -336,35 +336,26 @@ int main(int argc, const char* argv[])
// compute actual
using AttentionOp = FlashAttentionOp<scalar_t>;
using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = num_heads * seq_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = seq_len * size_per_head};
Layout layout_k{.stride_batch = num_heads * key_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_v{.stride_batch = num_heads * key_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_o{.stride_batch = num_heads * seq_len * size_per_head,
.stride_seq = num_heads * size_per_head,
.stride_head = size_per_head,
.use_seqlens = true};
Layout layout_q{num_heads * seq_len * size_per_head, size_per_head, seq_len * size_per_head};
Layout layout_k{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
Layout layout_v{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
Layout layout_o{num_heads * seq_len * size_per_head, num_heads * size_per_head, size_per_head, true};
AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head);
float* accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true);
typename AttentionOp::Params attn_params{.attn_out = actual_out_ptr,
.query = query_ptr,
.key = key_ptr,
.val = val_ptr,
.mask = mask_ptr,
.out_accum = accum_buf_ptr,
.cu_seqlens_q = cu_seqlens_ptr,
.cu_seqlens_k = nullptr,
.group_size = 1,
.layout_q = layout_q,
.layout_k = layout_k,
.layout_v = layout_v,
.layout_o = layout_o};
typename AttentionOp::Params attn_params{actual_out_ptr,
query_ptr,
key_ptr,
val_ptr,
mask_ptr,
accum_buf_ptr,
cu_seqlens_ptr,
nullptr,
1,
layout_q,
layout_k,
layout_v,
layout_o};
flash_attention(attn_params, stream);
sync_check_cuda_error();
......
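Note: as in the ConcateSlice hunk above, designated initializers are replaced with positional aggregate initialization for MSVC. The positional form is order-sensitive: the trailing `true` in `layout_o` now stands for the `.use_seqlens = true` field that the old syntax named explicitly, so these call sites silently depend on the member declaration order of `AttentionLayout` and `Params`.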
#include <assert.h>
#include <math.h>
#include <cublas_v2.h>
#include <math.h>
#include <numeric>
#include <stdexcept>
#include <tuple>
......@@ -18,35 +18,38 @@ using namespace turbomind;
// Can be replaced by the function provided by a test framework
class TestFailureError : public std::exception {
class TestFailureError: public std::exception {
private:
std::string msg_;
public:
explicit TestFailureError() = default;
explicit TestFailureError(std::string name, std::string msg = "") {
explicit TestFailureError(std::string name, std::string msg = "")
{
msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
}
const char* what () const throw () {
const char* what() const throw()
{
return msg_.c_str();
}
};
#define EXPECT_TRUE(cond) \
do { if(!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
__func__, __FILE__, __LINE__); \
do { \
if (!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
} \
} while (false)
#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
do { \
bool is_ok = checkResult<dtype,ctype>(name, out, ref); \
if(!is_ok) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
__func__, __FILE__, __LINE__); \
bool is_ok = checkResult<dtype, ctype>(name, out, ref); \
if (!is_ok) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while(false)
} while (false)
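Note: both macros are reflowed around the `do { ... } while (false)` idiom, which turns a multi-statement macro body into a single statement so it composes safely with unbraced `if`/`else`. A minimal illustration of what goes wrong without it (hypothetical CHECK macros, not the real EXPECT_* bodies):

#include <cstdio>
#include <cstdlib>

#define CHECK_BAD(cond)  { if (!(cond)) { std::puts("fail"); std::abort(); } }
#define CHECK_OK(cond)   do { if (!(cond)) { std::puts("fail"); std::abort(); } } while (false)

void demo(bool ready, int x)
{
    // With CHECK_BAD, the ';' after the braces is a statement of its own,
    // so a following 'else' has no matching 'if' and fails to compile:
    //   if (ready) CHECK_BAD(x > 0); else std::puts("skip");

    if (ready)
        CHECK_OK(x > 0);  // expands to one statement; the else binds correctly
    else
        std::puts("skip");
}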
////////////////////////////////////////////////////////////////////////////////////
......@@ -72,14 +75,15 @@ public:
this->data = this->allocator->malloc(tensor_memsize, false);
if (zero_init) {
check_cuda_error(cudaMemset(data, 0x0, tensor_memsize));
} else {
}
else {
setRandomValues();
}
this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data);
}
TensorWrapper(TensorWrapper const& other)
: allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
TensorWrapper(TensorWrapper const& other):
allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
{
TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
}
......@@ -97,7 +101,8 @@ public:
check_cuda_error(cudaMemset(data, 0xdc, tensor_size));
}
void setRandomValues() {
void setRandomValues()
{
// random initialization
size_t num_elements = this->size();
switch (this->type) {
......@@ -113,7 +118,8 @@ public:
}
}
size_t size() {
size_t size()
{
size_t n_elements = 1;
for (size_t s : this->shape) {
n_elements *= s;
......@@ -121,7 +127,8 @@ public:
return n_elements;
}
size_t memsize() {
size_t memsize()
{
size_t type_size = 0;
switch (this->type) {
case TYPE_FP32:
......@@ -170,12 +177,20 @@ void computeReference(GemmOp transa,
check_cuda_error(cublasGemmEx(cublas_handle,
getCublasOperation(transb),
getCublasOperation(transa),
n, m, k,
n,
m,
k,
_alpha,
(const void*)B.data, btype, ldb,
(const void*)A.data, atype, lda,
(const void*)B.data,
btype,
ldb,
(const void*)A.data,
atype,
lda,
_beta,
(void*)C.data, ctype, ldc,
(void*)C.data,
ctype,
ldc,
compute_type,
CUBLAS_GEMM_DEFAULT));
check_cuda_error(cublasDestroy(cublas_handle));
......@@ -199,7 +214,8 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
}
template<typename T>
bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol) {
bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol)
{
assert(out.type == ref.type);
size_t out_size = out.size();
......@@ -219,7 +235,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
bool ok = almostEqual(a, b, atol, rtol);
// Print the error.
if( !ok && failures < 4 ) {
if (!ok && failures < 4) {
TM_LOG_ERROR(">> invalid result for i=%lu:", i);
TM_LOG_ERROR(">> found......: %10.6f", a);
TM_LOG_ERROR(">> expected...: %10.6f", b);
......@@ -234,38 +250,46 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
// Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * out_size);
TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
name.c_str(), failures <= tol_failures ? "OK" : "FAILED",
100. * failures / out_size, atol, rtol);
name.c_str(),
failures <= tol_failures ? "OK" : "FAILED",
100. * failures / out_size,
atol,
rtol);
return failures <= tol_failures;
}
template<typename T, DataType computeType>
bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref) {
bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref)
{
float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
bool is_ok = false;
if (sizeof(T) == 4) {
is_ok = _checkResult<float>(name, out, ref, atol, rtol);
} else {
}
else {
is_ok = _checkResult<half>(name, out, ref, atol, rtol);
}
return is_ok;
}
template<typename T, DataType computeType>
bool checkResult(TensorWrapper& out, TensorWrapper& ref) {
bool checkResult(TensorWrapper& out, TensorWrapper& ref)
{
return checkResult<T, computeType>("", out, ref);
}
template<typename T>
std::string toString() {
std::string toString()
{
std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16";
return str;
}
template<typename T, DataType ctype>
std::string toString() {
std::string toString()
{
std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16";
str += ", compute_type=";
......@@ -273,7 +297,8 @@ std::string toString() {
return str;
}
std::string toString(GemmOp op) {
std::string toString(GemmOp op)
{
return op == GEMM_OP_N ? "N" : "T";
}
......@@ -282,32 +307,32 @@ struct GemmOpPair {
GemmOp transb;
};
static const std::vector<GemmOpPair> op_pairs {{GEMM_OP_N, GEMM_OP_N},
{GEMM_OP_N, GEMM_OP_T},
{GEMM_OP_T, GEMM_OP_N},
{GEMM_OP_T, GEMM_OP_T}};
static const std::vector<GemmOpPair> op_pairs{
{GEMM_OP_N, GEMM_OP_N}, {GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_T, GEMM_OP_N}, {GEMM_OP_T, GEMM_OP_T}};
static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb,
size_t m, size_t n, size_t k)
static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, size_t m, size_t n, size_t k)
{
return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]",
func_name, getGemmOpString(transa).c_str(), getGemmOpString(transb).c_str(),
m, n, k);
func_name,
getGemmOpString(transa).c_str(),
getGemmOpString(transb).c_str(),
m,
n,
k);
}
static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs,
size_t m, size_t n, size_t k)
static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, size_t m, size_t n, size_t k)
{
return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k);
}
/////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T, DataType computeType>
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
TM_LOG_INFO(
"Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
......@@ -322,48 +347,60 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
computeReference<computeType>(op_pair.transa, op_pair.transb,
expected, a_tensor, b_tensor);
computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_tensor.data, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_tensor.data, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
}
check_cuda_error(cudaStreamDestroy(stream));
}
template<typename T, DataType computeType>
void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
void testGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
TM_LOG_INFO(
"Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
......@@ -382,12 +419,8 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t cuda_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
......@@ -396,7 +429,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
// Switch A/B because Gemm expects column major layout as cublas does.
......@@ -405,33 +438,50 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
size_t ldc = n;
cublas_wrapper.Gemm(getCublasOperation(op_pair.transb),
getCublasOperation(op_pair.transa),
n, m, k,
b_tensor.data, ldb,
a_tensor.data, lda,
expected.data, ldc);
n,
m,
k,
b_tensor.data,
ldb,
a_tensor.data,
lda,
expected.data,
ldc);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_tensor.data, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_tensor.data, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
}
......@@ -442,10 +492,14 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
}
template<typename T, DataType computeType>
void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
......@@ -484,8 +538,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
(const T*)expecteds[2]->data};
T** batch_tensor_ptrs = reinterpret_cast<T**>(allocator.malloc(sizeof(T*) * 16, false));
check_cuda_error(cudaMemcpyAsync(
(void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream));
check_cuda_error(cudaMemcpyAsync((void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream));
const void* const* batch_a = reinterpret_cast<const void* const*>(batch_tensor_ptrs);
const void* const* batch_b = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4);
void* const* batch_c = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8);
......@@ -498,12 +551,8 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
......@@ -512,7 +561,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_type, b_type, c_type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
......@@ -526,42 +575,51 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
n,
m,
k,
(const void* const*)batch_b, ldb,
(const void* const*)batch_a, lda,
(void* const*)batch_expected, ldc,
(const void* const*)batch_b,
ldb,
(const void* const*)batch_a,
lda,
(void* const*)batch_expected,
ldc,
batch_size);
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k,
batch_a, a_type, lda,
batch_b, b_type, ldb,
batch_c, c_type, ldc,
gemm->batchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
batch_a,
a_type,
lda,
batch_b,
b_type,
ldb,
batch_c,
c_type,
ldc,
batch_size);
for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api1 batch" + std::to_string(i),
T, computeType, *c_tensors[i], *expecteds[i]);
EXPECT_ALMOST_EQUAL(
tc_name + " api1 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
}
for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues();
}
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k,
batch_a, lda,
batch_b, ldb,
batch_c, ldc,
batch_size);
gemm->batchedGemm(
op_pair.transa, op_pair.transb, m, n, k, batch_a, lda, batch_b, ldb, batch_c, ldc, batch_size);
for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api2 batch" + std::to_string(i),
T, computeType, *c_tensors[i], *expecteds[i]);
EXPECT_ALMOST_EQUAL(
tc_name + " api2 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
}
for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues();
}
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k,
batch_a, batch_b, batch_c, batch_size);
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, batch_a, batch_b, batch_c, batch_size);
for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api3 batch" + std::to_string(i),
T, computeType, *c_tensors[i], *expecteds[i]);
EXPECT_ALMOST_EQUAL(
tc_name + " api3 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
}
}
a_tensors.clear();
......@@ -574,12 +632,16 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
check_cuda_error(cudaStreamDestroy(stream));
}
template<typename T, DataType computeType>
void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k) {
void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
batch_size, m, n, k, toString<T, computeType>().c_str());
batch_size,
m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
......@@ -598,12 +660,8 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
......@@ -612,7 +670,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
// Switch A/B because Gemm expects column major layout as cublas does.
......@@ -650,35 +708,78 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
getCublasDataType(computeType));
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda, stridea,
b_tensor.data, b_tensor.type, ldb, strideb,
c_tensor.data, c_tensor.type, ldc, stridec,
batch_size, computeType, alpha, beta);
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
stridea,
b_tensor.data,
b_tensor.type,
ldb,
strideb,
c_tensor.data,
c_tensor.type,
ldc,
stridec,
batch_size,
computeType,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda, stridea,
b_tensor.data, ldb, strideb,
c_tensor.data, ldc, stridec,
batch_size, alpha, beta);
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
lda,
stridea,
b_tensor.data,
ldb,
strideb,
c_tensor.data,
ldc,
stridec,
batch_size,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, stridea,
b_tensor.data, strideb,
c_tensor.data, stridec,
batch_size, alpha, beta);
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
stridea,
b_tensor.data,
strideb,
c_tensor.data,
stridec,
batch_size,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
b_tensor.data,
c_tensor.data,
batch_size, alpha, beta);
batch_size,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
}
......@@ -692,9 +793,10 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
// The current SpGemm only supports TYPE_FP16 for T, computeType,
// but let us keep these template variables for later use.
template<typename T, DataType computeType>
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
TM_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
TM_LOG_INFO(
"Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
......@@ -709,47 +811,54 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
// A/B will be switched in SpGemm.
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
b_tensor.setRandomValues();
pruneMatrixB(b_tensor.data, stream,
b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
computeReference<computeType>(op_pair.transa, op_pair.transb,
expected, a_tensor, b_tensor);
pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
void* b_compressed;
compressMatrixB(&b_compressed, allocator, stream,
b_tensor.data, b_tensor.shape[0], b_tensor.shape[1],
op_pair.transb);
compressMatrixB(
&b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_compressed, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_compressed,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_compressed, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_compressed, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, (const T*)b_compressed},
c_tensor.data);
......@@ -761,10 +870,14 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
}
template<typename T, DataType computeType>
void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
......@@ -783,12 +896,8 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t cu_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t cu_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
......@@ -797,13 +906,12 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
b_tensor.setRandomValues();
pruneMatrixB(b_tensor.data, stream,
b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
// Switch A/B because Gemm expects column major layout as cublas does.
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
......@@ -814,32 +922,40 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
n,
m,
k,
b_tensor.data, ldb,
a_tensor.data, lda,
expected.data, ldc);
b_tensor.data,
ldb,
a_tensor.data,
lda,
expected.data,
ldc);
void* b_compressed;
compressMatrixB(&b_compressed, allocator, stream,
b_tensor.data, b_tensor.shape[0], b_tensor.shape[1],
op_pair.transb);
compressMatrixB(
&b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_compressed, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_compressed,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_compressed, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_compressed, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
}
......@@ -850,18 +966,16 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
}
#endif
int main(int argc, char* argv[]) {
int main(int argc, char* argv[])
{
// testGemmCreate();
using testcase_t = std::tuple<size_t, size_t, size_t>;
std::vector<testcase_t> testcases = {{16, 32, 64},
{255, 255, 255},
{1041, 2047, 9999},
{1041, 1, 9999},
{1041, 999, 1}};
std::vector<testcase_t> testcases = {
{16, 32, 64}, {255, 255, 255}, {1041, 2047, 9999}, {1041, 1, 9999}, {1041, 999, 1}};
// Computation correctness tests
for (testcase_t &tc : testcases) {
for (testcase_t& tc : testcases) {
size_t m = std::get<0>(tc);
size_t n = std::get<1>(tc);
size_t k = std::get<2>(tc);
......@@ -896,7 +1010,7 @@ int main(int argc, char* argv[]) {
{16, 1024, 1024},
{1024, 1024, 1024}});
for (testcase_t &tc : testcases) {
for (testcase_t& tc : testcases) {
size_t m = std::get<0>(tc);
size_t n = std::get<1>(tc);
size_t k = std::get<2>(tc);
......
......@@ -5,10 +5,10 @@
#include <string>
#include <vector>
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include <algorithm>
#include <iostream>
......@@ -39,13 +39,14 @@ protected:
void testTransposition();
};
void fill_tensor_random(Tensor a) {
void fill_tensor_random(Tensor a)
{
const size_t num_elems = a.size();
std::vector<int8_t> host_values(num_elems);
std::uniform_int_distribution<int8_t> int8_random(-128, 127);
std::mt19937 rng(0);
std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng](){ return int8_random(rng); });
std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); });
cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems);
}
......@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition()
int8_t *a_data, *a_t_data;
cudaMalloc(&a_data, m * k * sizeof(int8_t));
Tensor a {MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
fill_tensor_random(a);
cudaMalloc(&a_t_data, k * m * sizeof(int8_t));
Tensor a_t {MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
std::vector<int8_t> a_t_host_ref(a_t.size());
reference_transpose_host(a_t_host_ref, a);
......
#include <assert.h>
#include <math.h>
#include <float.h>
#include <math.h>
#include <stdexcept>
#include <tuple>
#include <vector>
#ifdef __linux__
#include <sys/time.h>
#endif
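Note: `<sys/time.h>` and `gettimeofday` are POSIX-only, hence the `__linux__` guard; per the "remove gettimeofday" commit above, timing call sites move to portable alternatives. A sketch of the usual `std::chrono` replacement (hypothetical helper, not the exact code in this commit):

#include <chrono>

// Hypothetical portable replacement for a gettimeofday()-based timer.
static double elapsed_ms_since(std::chrono::steady_clock::time_point t0)
{
    using namespace std::chrono;
    return duration<double, std::milli>(steady_clock::now() - t0).count();
}

// Usage:
//   auto t0 = std::chrono::steady_clock::now();
//   ... kernel launches + cudaStreamSynchronize(stream) ...
//   double ms = elapsed_ms_since(t0);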
#include "src/turbomind/kernels/logprob_kernels.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -24,15 +25,19 @@ struct LogProbKernelTestParam {
size_t vocab_size;
size_t beam_width;
std::string toString() {
std::string toString()
{
return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]",
max_input_length, batch_size, vocab_size, beam_width);
max_input_length,
batch_size,
vocab_size,
beam_width);
}
};
/////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T>
class LogProbKernelTest : public FtTestBase {
class LogProbKernelTest: public FtTestBase {
protected:
void computeCumLogProbs(float* cum_log_probs,
......@@ -122,8 +127,8 @@ protected:
}
public:
void runTest(LogProbKernelTestParam param) {
void runTest(LogProbKernelTestParam param)
{
size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
......@@ -146,8 +151,8 @@ public:
// device buffers
T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers
......@@ -189,7 +194,8 @@ public:
delete[] h_logits;
}
void runBatchFirstTest(LogProbKernelTestParam param) {
void runBatchFirstTest(LogProbKernelTestParam param)
{
size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
......@@ -213,8 +219,8 @@ public:
// device buffers
T* d_logits =
reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers
......@@ -256,10 +262,8 @@ public:
delete[] h_input_ids;
delete[] h_logits;
}
};
TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes);
TYPED_TEST(LogProbKernelTest, SingleStep)
......
......@@ -23,15 +23,15 @@
#include <unordered_map>
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "gtest_utils.h"
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "gtest_utils.h"
using namespace turbomind;
......@@ -41,13 +41,17 @@ struct TemperatureTestParam {
float* temperatures;
size_t temperatures_size;
std::string toString() {
std::string toString()
{
return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]",
batch_size, vocab_size, arr2str(temperatures, temperatures_size).c_str());
batch_size,
vocab_size,
arr2str(temperatures, temperatures_size).c_str());
}
};
size_t pad_vocab_size(size_t vocab_size, size_t pad = 8) {
size_t pad_vocab_size(size_t vocab_size, size_t pad = 8)
{
return (vocab_size + pad - 1) / pad * pad;
}
......@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits,
int token_id = output_ids[i + t * batch_size];
if (!penalized[token_id]) {
float logit = static_cast<float>(logits[offset + token_id]);
logits[offset + token_id] = static_cast<T>(logit < 0.0f ?
logit * repetition_penalty : logit / repetition_penalty);
logits[offset + token_id] =
static_cast<T>(logit < 0.0f ? logit * repetition_penalty : logit / repetition_penalty);
penalized[token_id] = true;
}
}
......@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits,
}
template<typename T>
void initLogitsAndBias(T* logits,
T* bias,
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded)
void initLogitsAndBias(
T* logits, T* bias, const size_t batch_size, const size_t vocab_size, const size_t vocab_size_padded)
{
initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f);
if (bias != nullptr) {
......@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits,
}
}
/////////////////////////////////// Tests //////////////////////////////////////////
template<typename T>
class TemperaturePenaltyTest : public FtTestBase {
class TemperaturePenaltyTest: public FtTestBase {
protected:
// Set up test
size_t batch_size_;
......@@ -157,7 +157,8 @@ protected:
float* d_temperatures_;
void subsetup(TemperatureTestParam param) {
void subsetup(TemperatureTestParam param)
{
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_);
......@@ -177,7 +178,8 @@ protected:
}
}
void subteardown() {
void subteardown()
{
delete[] h_logits_;
delete[] h_bias_;
}
......@@ -204,29 +206,18 @@ protected:
}
}
public:
void runTest(TemperatureTestParam param)
{
subsetup(param);
// Do test
if (param.temperatures_size == 1) {
invokeApplyTemperaturePenalty(d_logits_,
d_bias_,
param.temperatures[0],
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
invokeApplyTemperaturePenalty(
d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
}
else {
invokeBatchApplyTemperaturePenalty(d_logits_,
d_bias_,
d_temperatures_,
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
invokeBatchApplyTemperaturePenalty(
d_logits_, d_bias_, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
}
computeReference(h_logits_,
h_bias_,
......@@ -240,19 +231,15 @@ public:
subteardown();
}
void runConsistencyTest(TemperatureTestParam param) {
void runConsistencyTest(TemperatureTestParam param)
{
// Set up test
ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1";
subsetup(param);
// Run a single runtime value case.
invokeApplyTemperaturePenalty(d_logits_,
d_bias_,
param.temperatures[0],
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
invokeApplyTemperaturePenalty(
d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
float temperature = param.temperatures[0];
float* h_temperatures = new float[batch_size_];
......@@ -267,14 +254,10 @@ public:
cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream);
invokeBatchApplyTemperaturePenalty(d_logits_batch,
d_bias_batch,
d_temperatures_,
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
bool passed = checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
invokeBatchApplyTemperaturePenalty(
d_logits_batch, d_bias_batch, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
bool passed =
checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
EXPECT_TRUE(passed);
// Tear down test
......@@ -348,7 +331,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchMixed)
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = i % 2 ==0 ? 2.01f : 0.53f;
temperatures[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, temperatures, batch_size});
}
......@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase {
size_t repetition_penalties_size;
RepetitionPenaltyType repetition_penalty_type;
std::string toString() {
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map {
std::string toString()
{
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map{
{RepetitionPenaltyType::Additive, "additive"},
{RepetitionPenaltyType::Multiplicative, "multiplicative"},
{RepetitionPenaltyType::None, "none"}};
return fmtstr(
"RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
return fmtstr("RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
"repetition_penalties=%s, repetition_penalty_type=%s]",
batch_size, vocab_size, max_input_length,
batch_size,
vocab_size,
max_input_length,
arr2str(repetition_penalties, repetition_penalties_size).c_str(),
typestr_map.at(repetition_penalty_type).c_str());
}
};
template<typename T>
class RepetitionPenaltyTest : public FtTestBase {
class RepetitionPenaltyTest: public FtTestBase {
protected:
// Set up test
size_t batch_size_;
......@@ -404,7 +389,8 @@ protected:
float* d_repetition_penalties_;
void subsetup(RepetitionPenaltyTestCase param) {
void subsetup(RepetitionPenaltyTestCase param)
{
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_);
......@@ -437,7 +423,8 @@ protected:
}
}
void subteardown() {
void subteardown()
{
delete[] h_logits_;
delete[] h_bias_;
delete[] h_output_ids_;
......@@ -540,7 +527,8 @@ public:
subteardown();
}
void runConsistencyTest(RepetitionPenaltyTestCase param) {
void runConsistencyTest(RepetitionPenaltyTestCase param)
{
// Set up test
ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1";
subsetup(param);
......@@ -651,7 +639,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchMixed)
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Multiplicative});
}
......@@ -667,7 +655,7 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditive)
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
......@@ -683,7 +671,7 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero2)
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 1.0f : 0.0f;
repetition_penalties[i] = i % 2 == 0 ? 1.0f : 0.0f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
......
......@@ -12,6 +12,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......
......@@ -5,8 +5,8 @@
#include <string> // std::string
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
......@@ -14,6 +14,7 @@
#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral)
this->runTest({6, 4, 1, 4, 1.0f, 1});
};
TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63)
{
this->runTest({16, 51200, 1, 63, 1.0f, 8});
......@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP)
this->runBatchTest({8, 4000, 1, 63, 0.3f, 8});
};
template<typename T>
class TopPSamplingKernelTest: public SamplingKernelTest<T> {
......@@ -496,8 +495,8 @@ public:
struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>(
allocator->malloc(sizeof(curandState_t) * batch_size, false));
curandState_t* curand_states =
reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream);
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
......@@ -553,12 +552,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf,
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
invokeTopPSampling<T>(workspace,
workspace_size,
......@@ -647,8 +641,8 @@ public:
struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>(
allocator->malloc(sizeof(curandState_t) * batch_size, false));
curandState_t* curand_states =
reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream);
float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size));
......@@ -709,12 +703,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf,
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
invokeBatchTopPSampling<T>(workspace,
workspace_size,
......@@ -825,15 +814,16 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
};
__global__
void generateRandomNumber(unsigned int *vals, curandState_t *states, const int batch_size) {
__global__ void generateRandomNumber(unsigned int* vals, curandState_t* states, const int batch_size)
{
int idx = threadIdx.x;
if (idx < batch_size) {
vals[idx] = curand(states + idx);
}
}
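// For context: invokeCurandBatchInitialize seeds one curandState_t per batch slot
// from its own seed, which is what makes the reproducibility check in the test
// below possible. A minimal sketch of that pattern (assumed semantics; the kernel
// name is hypothetical, requires <curand_kernel.h>):
__global__ void curandBatchInitSketch(curandState_t* states, const unsigned long long* seeds, const int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // independent sequence per slot, fully determined by seeds[i]
        curand_init(seeds[i], /*subsequence=*/0, /*offset=*/0, &states[i]);
    }
}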
TEST(SamplingKernelTest, CurandBatchInitialize) {
TEST(SamplingKernelTest, CurandBatchInitialize)
{
size_t batch_size = 127;
cudaStream_t stream;
cudaStreamCreate(&stream);
......@@ -847,8 +837,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
}
unsigned long long* d_random_seeds;
check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size));
check_cuda_error(cudaMemcpy(d_random_seeds, h_random_seeds,
sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
check_cuda_error(
cudaMemcpy(d_random_seeds, h_random_seeds, sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
// Initialize curand states.
invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream);
......@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
unsigned int* h_rand_vals = new unsigned int[batch_size];
check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size));
generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size);
check_cuda_error(cudaMemcpyAsync(
h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(
cudaMemcpyAsync(h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(cudaStreamSynchronize(stream));
// The same seed produces the same random number.
......
......@@ -5,17 +5,18 @@
#include <string> // std::string
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "gtest_utils.h"
......@@ -29,14 +30,21 @@ struct SamplingLayerTestParam {
float top_p;
size_t output_len;
std::string toString() {
std::string toString()
{
return fmtstr("SamplingLayerTestParam[batch=%ld, vocab=%ld, beam=%ld, k=%ld, p=%3.1f, output_len=%ld]",
batch_size, vocab_size, beam_width, top_k, top_p, output_len);
batch_size,
vocab_size,
beam_width,
top_k,
top_p,
output_len);
}
};
template<typename T>
void computeProb(T* probs, T* logits, int batch_size, int vocab_size) {
void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
{
// Compute the probability from logits.
// logits = batch_size x vocab_size vector.
// probs = softmax(logits) (softmax along the vocab dimension)
......@@ -53,7 +61,8 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size) {
}
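// A minimal host-side sketch of the softmax reference that computeProb and
// computeLogProb describe (hypothetical helper, shown only to make the math
// explicit; assumes <cmath>, <cfloat> and <algorithm>, float logits):
void softmaxRefSketch(float* probs, const float* logits, int batch_size, int vocab_size)
{
    for (int b = 0; b < batch_size; ++b) {
        const float* in  = logits + b * vocab_size;
        float*       out = probs + b * vocab_size;
        float maxval = -FLT_MAX;  // subtract the row max for numerical stability
        for (int v = 0; v < vocab_size; ++v) {
            maxval = std::max(maxval, in[v]);
        }
        float sum = 0.0f;
        for (int v = 0; v < vocab_size; ++v) {
            sum += expf(in[v] - maxval);
        }
        for (int v = 0; v < vocab_size; ++v) {
            out[v] = expf(in[v] - maxval) / sum;
        }
    }
}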
template<typename T>
void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
{
// Compute the log probability from logits.
// logits = batch_size x vocab_size vector.
// logprobs = log(softmax(logits)) (softmax along with vocab dimension)
......@@ -91,9 +100,9 @@ protected:
ft::Allocator<ft::AllocatorType::CUDA>* allocator;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
std::mutex *cublas_wrapper_mutex;
cublasMMWrapper *cublas_wrapper;
DynamicDecodeLayer<T> *dynamic_decode_layer;
std::mutex* cublas_wrapper_mutex;
cublasMMWrapper* cublas_wrapper;
DynamicDecodeLayer<T>* dynamic_decode_layer;
int* h_output_ids;
T* h_logits;
......@@ -109,7 +118,8 @@ protected:
int* d_output_ids;
int* d_end_ids;
void setup(unsigned long long seed = 0) {
void setup(unsigned long long seed = 0)
{
this->seed = seed;
check_cuda_error(cudaStreamCreate(&stream));
......@@ -124,12 +134,8 @@ protected:
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
allocator);
cublas_wrapper = new cublasMMWrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, allocator);
dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size,
......@@ -168,7 +174,8 @@ protected:
deviceFill(d_end_ids, batchxbeam, end_id, stream);
}
void teardown() {
void teardown()
{
delete[] test_input_logits;
delete[] h_output_ids;
delete[] h_logits;
......@@ -185,12 +192,8 @@ protected:
check_cuda_error(cudaStreamDestroy(stream));
}
TensorMap* createInputTensors(int* topk,
size_t topk_size,
float* topp,
size_t topp_size,
float* temperature,
float* repetition_penalty)
TensorMap* createInputTensors(
int* topk, size_t topk_size, float* topp, size_t topp_size, float* temperature, float* repetition_penalty)
{
// construct common input tensors
TensorMap* input_tensors = new TensorMap();
......@@ -206,16 +209,19 @@ protected:
if (repetition_penalty != nullptr) {
input_tensors->insert({"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty}});
}
input_tensors->insert({"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}});
input_tensors->insert(
{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}});
input_tensors->insert({"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}});
input_tensors->insert({"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}});
input_tensors->insert({"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}});
input_tensors->insert(
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}});
input_tensors->insert({"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batchxbeam}, &d_end_ids}});
input_tensors->insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, {1}, &seed}});
return input_tensors;
}
TensorMap* createOutputTensors() {
TensorMap* createOutputTensors()
{
// construct common output tensors
TensorMap* output_tensors = new TensorMap();
output_tensors->insert(
......@@ -226,18 +232,19 @@ protected:
output_tensors->insert(
{"output_log_probs",
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, d_output_log_probs}});
output_tensors->insert(
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
output_tensors->insert({"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
return output_tensors;
}
void batchH2Dcpy(T* dst, T* src, size_t m, size_t n) {
void batchH2Dcpy(T* dst, T* src, size_t m, size_t n)
{
for (size_t i = 0; i < m; ++i) {
cudaH2Dcpy(dst + i * n, src, n);
}
}
bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids) {
bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids)
{
assert(expected_ids.size() == max_seq_len * batchxbeam);
int* h_output_ids = new int[max_seq_len * batchxbeam];
cudaD2Hcpy(h_output_ids, d_output_ids, max_seq_len * batchxbeam);
......@@ -260,8 +267,8 @@ protected:
++failures;
}
}
TM_LOG_DEBUG("check...%6s : failures: %d / %d",
failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
TM_LOG_DEBUG(
"check...%6s : failures: %d / %d", failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
delete[] h_output_ids;
return failures == 0;
}
......@@ -281,8 +288,8 @@ public:
for (unsigned long long seed = 0; seed < max_seed; ++seed) {
this->setup(seed);
size_t step = max_input_len;
TensorMap* input_tensors = createInputTensors(
top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty);
TensorMap* input_tensors =
createInputTensors(top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty);
input_tensors->insert({"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}});
input_tensors->insert({"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}});
input_tensors->insert({"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}});
......@@ -317,12 +324,27 @@ TYPED_TEST_SUITE(SamplingDecodeTest, FloatAndHalfTypes);
TYPED_TEST(SamplingDecodeTest, TopK)
{
int top_k = 2;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
// 0 1 2 3 4 5
{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, // step 0
{4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, // step 1
{2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3} // step 2
{0, 1},
{0, 1},
{0, 1},
{0, 1},
{0, 1},
{0, 1}, // step 0
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5}, // step 1
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
}
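// Why sets rather than single ids: with top_k = 2 either of the two largest logits
// may be drawn, so the test accepts both per step. A host sketch of top-k masking
// (hypothetical helper; assumes <vector>, <algorithm>, <functional>, <cfloat>):
void topKMaskSketch(float* logits, int vocab_size, int k)
{
    std::vector<float> vals(logits, logits + vocab_size);
    std::nth_element(vals.begin(), vals.begin() + (k - 1), vals.end(), std::greater<float>());
    const float kth = vals[k - 1];  // value of the k-th largest logit
    for (int v = 0; v < vocab_size; ++v) {
        if (logits[v] < kth) {
            logits[v] = -FLT_MAX;  // masked out: excluded from sampling
        }
    }
}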
......@@ -331,12 +353,27 @@ TYPED_TEST(SamplingDecodeTest, BatchTopK)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 1, 2, 1, 1};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
// 0 1 2 3 4 5
{0, 1}, {0}, {0}, {0, 1}, {0}, {0}, // step 0
{4, 5}, {4}, {4}, {4, 5}, {4}, {4}, // step 1
{2, 3}, {2}, {2}, {2, 3}, {2}, {2} // step 2
{0, 1},
{0},
{0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2},
{2},
{2, 3},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks;
......@@ -345,11 +382,26 @@ TYPED_TEST(SamplingDecodeTest, BatchTopK)
TYPED_TEST(SamplingDecodeTest, TopP)
{
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
}
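// Nucleus (top-p) filtering in brief: sort ids by probability descending and keep
// the smallest prefix whose cumulative mass reaches p; with top_p = 0.3 only the
// single most likely id survives in this test. A host sketch (hypothetical helper;
// assumes <vector>, <numeric>, <algorithm>):
std::vector<int> topPIdsSketch(const std::vector<float>& probs, float p)
{
    std::vector<int> ids(probs.size());
    std::iota(ids.begin(), ids.end(), 0);
    std::sort(ids.begin(), ids.end(), [&](int a, int b) { return probs[a] > probs[b]; });
    std::vector<int> kept;
    float cum = 0.0f;
    for (int id : ids) {
        kept.push_back(id);
        cum += probs[id];
        if (cum >= p) {
            break;  // nucleus reached
        }
    }
    return kept;
}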
......@@ -358,38 +410,83 @@ TYPED_TEST(SamplingDecodeTest, BatchTopP)
{
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids {
{0}, {0, 1}, {0, 1}, {0}, {0, 1}, {0, 1}, // step 0
{4}, {4, 5}, {4, 5}, {4}, {4, 5}, {4, 5}, // step 1
{2}, {2, 3}, {2, 3}, {2}, {2, 3}, {2, 3} // step 2
std::vector<std::set<int>> expected_output_ids{
{0},
{0, 1},
{0, 1},
{0},
{0, 1},
{0, 1}, // step 0
{4},
{4, 5},
{4, 5},
{4},
{4, 5},
{4, 5}, // step 1
{2},
{2, 3},
{2, 3},
{2},
{2, 3},
{2, 3} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, TopKTopP) {
TYPED_TEST(SamplingDecodeTest, TopKTopP)
{
int top_k = 2;
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
}
TYPED_TEST(SamplingDecodeTest, BatchTopKTopP)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 2, 2, 1};
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks;
......@@ -400,11 +497,26 @@ TYPED_TEST(SamplingDecodeTest, TopKBatchTopP)
size_t batch_size = this->batch_size;
int top_k = 2;
float* top_ps = new float[batch_size]{0.5, 0.3, 0.5, 0.5, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2
{0, 1},
{0},
{0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
......@@ -415,11 +527,26 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2
{0, 1},
{0},
{0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks;
......@@ -430,11 +557,26 @@ TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopK)
{
size_t batch_size = this->batch_size;
int top_k = 0;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
}
......@@ -443,11 +585,26 @@ TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopP)
{
size_t batch_size = this->batch_size;
float top_p = 0;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
}
......@@ -457,133 +614,292 @@ TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKTopP)
size_t batch_size = this->batch_size;
int top_k = 0;
float top_p = 0;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
}
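// Note on the InvalidArgs* cases: the single-id expected sets suggest that zero
// top_k and/or zero top_p is treated as "sampling disabled", i.e. the layer falls
// back to greedy argmax decoding for that batch entry.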
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP) {
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{0, 0, 0, 0, 0, 0};
float top_p = 0;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP) {
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP)
{
size_t batch_size = this->batch_size;
int top_k = 0;
float* top_ps = new float[batch_size]{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero) {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 0, 0, 2, 1};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0}, {0}, {0}, {0, 1}, {0}, // step 0
{4, 5}, {4}, {4}, {4}, {4, 5}, {4}, // step 1
{2, 3}, {2}, {2}, {2}, {2, 3}, {2} // step 2
{0, 1},
{0},
{0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2},
{2},
{2},
{2, 3},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero) {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero)
{
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.5f, 0.5f, 0.0f, 0.5f, 0.0f, 0.3f};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0, 1}, {0}, {0, 1}, {0}, {0}, // step 0
{4, 5}, {4, 5}, {4}, {4, 5}, {4}, {4}, // step 1
{2, 3}, {2, 3}, {2}, {2, 3}, {2}, {2} // step 2
{0, 1},
{0, 1},
{0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2, 3},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero) {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 0, 2, 0};
float top_p = 0.0;
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0, 1}, {0}, {0}, {0, 1}, {0}, // step 0
{4, 5}, {4, 5}, {4}, {4}, {4, 5}, {4}, // step 1
{2, 3}, {2, 3}, {2}, {2}, {2, 3}, {2} // step 2
{0, 1},
{0, 1},
{0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2},
{2, 3},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero) {
TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero)
{
size_t batch_size = this->batch_size;
int top_k = 0;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0, 1}, {0}, {0}, {0, 1}, // step 0
{4}, {4}, {4, 5}, {4}, {4}, {4, 5}, // step 1
{2}, {2}, {2, 3}, {2}, {2}, {2, 3} // step 2
{0},
{0},
{0, 1},
{0},
{0},
{0, 1}, // step 0
{4},
{4},
{4, 5},
{4},
{4},
{4, 5}, // step 1
{2},
{2},
{2, 3},
{2},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero) {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{0, 2, 1, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.9, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0, 1}, {0}, {0, 1}, // step 0
{4}, {4}, {4}, {4, 5}, {4}, {4, 5}, // step 1
{2}, {2}, {2}, {2, 3}, {2}, {2, 3} // step 2
{0},
{0},
{0},
{0, 1},
{0},
{0, 1}, // step 0
{4},
{4},
{4},
{4, 5},
{4},
{4, 5}, // step 1
{2},
{2},
{2},
{2, 3},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks;
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP) {
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP)
{
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids {
{0}, {0}, {0, 1}, {0}, {0}, {0}, // step 0
{0}, {0}, {4, 5}, {4}, {0}, {0}, // step 1
{0}, {0}, {2, 3}, {2}, {0}, {0} // step 2
std::vector<std::set<int>> expected_output_ids{
{0},
{0},
{0, 1},
{0},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2},
{0},
{0} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP) {
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0, 1}, {0, 1}, {0}, {0}, // step 0
{0}, {0}, {4, 5}, {4, 5}, {0}, {0}, // step 1
{0}, {0}, {2, 3}, {2, 3}, {0}, {0} // step 2
{0},
{0},
{0, 1},
{0, 1},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4, 5},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2, 3},
{0},
{0} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ks;
......@@ -603,13 +919,8 @@ public:
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublas_algo_map = new cublasAlgoMap("");
cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle,
cublaslt_handle,
stream,
cublas_algo_map,
cublas_wrapper_mutex,
allocator);
cublas_wrapper = new cublasMMWrapper(
cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, allocator);
}
void TearDown() override
{
......@@ -632,7 +943,6 @@ protected:
std::mutex* cublas_wrapper_mutex;
cublasMMWrapper* cublas_wrapper;
DataType data_type = getTensorType<T>();
size_t batch_size;
......@@ -694,14 +1004,13 @@ protected:
deviceFill(d_end_ids, batch_size, end_id);
}
void teardown() {
void teardown()
{
delete[] h_logits;
delete[] h_output_ids;
}
void runCurandTest(SamplingLayerTestParam param,
bool use_local_batch,
bool use_single_random_seed)
void runCurandTest(SamplingLayerTestParam param, bool use_local_batch, bool use_single_random_seed)
{
setup(param);
const DataType data_type = getTensorType<T>();
......@@ -709,7 +1018,7 @@ protected:
const size_t local_batch_size = use_local_batch ? 3 : batch_size;
assert(batch_size % local_batch_size == 0);
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size,
end_id,
stream,
......@@ -739,8 +1048,8 @@ protected:
cudaH2Dcpy(d_logits, h_logits, batchxbeam * vocab_size);
for (uint ite = 0; ite < iteration_num; ++ite) {
TensorMap dynamic_decode_input_tensors({
{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}},
TensorMap dynamic_decode_input_tensors(
{{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
......@@ -750,18 +1059,16 @@ protected:
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_end_ids}},
{"random_seed", {MEMORY_CPU, TYPE_UINT64, {random_seed_size}, random_seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}}
});
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}}});
// common outputs
TensorMap dynamic_decode_output_tensors({
{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}},
TensorMap dynamic_decode_output_tensors(
{{"output_ids",
Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}},
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}
});
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});
dynamic_decode_layer->forward(&dynamic_decode_output_tensors,
&dynamic_decode_input_tensors);
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
sync_check_cuda_error();
// check results.
......@@ -774,7 +1081,11 @@ protected:
for (size_t j = 1; j < period_size; ++j) {
EXPECT_TRUE(h_output_ids[i] == h_output_ids[i + j])
<< fmtstr("Fail at step %u val[%d]=%d <> val[%d]=%d",
step, i, h_output_ids[i], i + j, h_output_ids[i + j]);
step,
i,
h_output_ids[i],
i + j,
h_output_ids[i + j]);
}
}
}
......@@ -783,11 +1094,12 @@ protected:
teardown();
}
void runCumLogProbTest(SamplingLayerTestParam param) {
void runCumLogProbTest(SamplingLayerTestParam param)
{
setup(param);
unsigned long long seed = 43;
const DataType data_type = getTensorType<T>();
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size,
end_id,
stream,
......@@ -810,10 +1122,11 @@ protected:
int* tiled_input_lengths_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * beam_width));
float* cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size * beam_width));
float* output_log_probs = reinterpret_cast<float*>(
allocator->malloc(sizeof(float) * max_output_len * batch_size * beam_width));
float* output_log_probs =
reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batch_size * beam_width));
int* output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batch_size * beam_width));
int* output_ids =
reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batch_size * beam_width));
int* h_output_ids = new int[batch_size * beam_width];
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
......@@ -824,26 +1137,23 @@ protected:
cudaMemset(output_log_probs, 0, sizeof(float) * max_output_len * batch_size * beam_width);
cudaMemset(output_ids, 0, sizeof(int) * max_seq_len * batch_size * beam_width);
TensorMap input_tensors({
{"random_seed", {MEMORY_CPU, TYPE_INT32, {1}, &seed}},
TensorMap input_tensors({{"random_seed", {MEMORY_CPU, TYPE_INT32, {1}, &seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
{"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}
});
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});
dynamic_decode_layer->setup(batch_size, beam_width, &input_tensors);
for (size_t step = max_input_len; step < max_output_len; ++step) {
uint ite = 0;
// Reset to the test value since the sampling layer internally updates the logit buffer (making it log-prob).
cudaH2Dcpy(d_logits, h_logits, batch_size * beam_width * vocab_size);
TensorMap dynamic_decode_input_tensors({
{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}},
TensorMap dynamic_decode_input_tensors(
{{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
{"input_lengths",
Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf}},
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &batch_size}},
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids}},
......@@ -851,38 +1161,40 @@ protected:
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
{"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}
});
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});
// common outputs
TensorMap dynamic_decode_output_tensors({
{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids}},
TensorMap dynamic_decode_output_tensors(
{{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids}},
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
{"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, cum_log_probs}},
{"output_log_probs",
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, output_log_probs}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});
dynamic_decode_layer->forward(&dynamic_decode_output_tensors,
&dynamic_decode_input_tensors);
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
TM_LOG_DEBUG("Step %2d generated ids", step);
cudaD2Hcpy(h_output_ids,
dynamic_decode_output_tensors
.at("output_ids")
.getPtrWithOffset<int>(step * (batch_size * beam_width)),
cudaD2Hcpy(
h_output_ids,
dynamic_decode_output_tensors.at("output_ids").getPtrWithOffset<int>(step * (batch_size * beam_width)),
batch_size * beam_width);
cudaD2Hcpy(h_cum_log_probs, cum_log_probs, batch_size * beam_width);
cudaD2Hcpy(h_output_log_probs, output_log_probs, max_output_len * batch_size * beam_width);
for (size_t i = 0; i < batch_size * beam_width; ++i) {
int idx = i * vocab_size + h_output_ids[i];
expected_cum_log_probs[i] += (float)h_log_probs[idx];
TM_LOG_DEBUG(
"| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
TM_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
"| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
(int)step, (int)i, (int)idx, (int)h_output_ids[i],
h_output_log_probs[step * batch_size * beam_width + i], (float)h_log_probs[idx],
h_cum_log_probs[i], expected_cum_log_probs[i], (float)h_probs[idx]);
(int)step,
(int)i,
(int)idx,
(int)h_output_ids[i],
h_output_log_probs[step * batch_size * beam_width + i],
(float)h_log_probs[idx],
h_cum_log_probs[i],
expected_cum_log_probs[i],
(float)h_probs[idx]);
}
TM_LOG_DEBUG("");
}
......@@ -898,7 +1210,6 @@ protected:
delete dynamic_decode_layer;
}
};
TYPED_TEST_SUITE(SamplingDecodeTest2, FloatAndHalfTypes);
......
#include <iostream>
#include <vector>
#include <unordered_map>
#include <vector>
#include <gtest/gtest.h>
......@@ -16,9 +16,10 @@ namespace {
EXPECT_TRUE(t1.type == t2.type); \
EXPECT_TRUE(t1.shape == t2.shape); \
EXPECT_TRUE(t1.data == t2.data); \
} while(false)
} while (false)
TEST(TensorMapTest, HasKeyCorrectness) {
TEST(TensorMapTest, HasKeyCorrectness)
{
bool* v1 = new bool(true);
float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1};
......@@ -33,7 +34,8 @@ TEST(TensorMapTest, HasKeyCorrectness) {
delete[] v2;
}
TEST(TensorMapTest, InsertCorrectness) {
TEST(TensorMapTest, InsertCorrectness)
{
int* v1 = new int[4]{1, 10, 20, 30};
float* v2 = new float[2]{1.0f, 2.0f};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
......@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) {
EXPECT_FALSE(map.isExist("t2"));
}
TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
TEST(TensorMapTest, InsertDoesNotAllowNoneTensor)
{
TensorMap map;
EXPECT_TRUE(map.size() == 0);
// forbid a none tensor.
......@@ -57,7 +60,8 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error);
}
TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey)
{
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
......@@ -68,7 +72,8 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
delete[] v1;
}
TEST(TensorMapTest, GetValCorrectness) {
TEST(TensorMapTest, GetValCorrectness)
{
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
......@@ -93,7 +98,8 @@ TEST(TensorMapTest, GetValCorrectness) {
delete[] v1;
}
TEST(TensorMapTest, GetTensorCorrectness) {
TEST(TensorMapTest, GetTensorCorrectness)
{
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
......@@ -114,7 +120,8 @@ TEST(TensorMapTest, GetTensorCorrectness) {
delete[] t1_val;
}
TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap)
{
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
......@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
delete[] t1_val;
}
TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
TEST(TensorTest, EmptyTensorMinMaxRaiseError)
{
Tensor t1;
EXPECT_THROW(t1.min<int>(), std::runtime_error);
EXPECT_THROW(t1.max<int>(), std::runtime_error);
......@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
EXPECT_THROW(t2.max<int>(), std::runtime_error);
}
using TensorTypes = testing::Types<int8_t, int, float>;
template <typename T>
class TensorFuncTest : public testing::Test {};
template<typename T>
class TensorFuncTest: public testing::Test {};
TYPED_TEST_SUITE(TensorFuncTest, TensorTypes);
TYPED_TEST(TensorFuncTest, MaxCorrectness) {
TYPED_TEST(TensorFuncTest, MaxCorrectness)
{
using T = TypeParam;
size_t size = 4;
T* v1 = new T[size] {T(1), T(2), T(3), T(4)};
T* v2 = new T[size] {T(4), T(3), T(2), T(1)};
T* v3 = new T[size] {T(1), T(2), T(4), T(3)};
T* v1 = new T[size]{T(1), T(2), T(3), T(4)};
T* v2 = new T[size]{T(4), T(3), T(2), T(1)};
T* v3 = new T[size]{T(1), T(2), T(4), T(3)};
Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1);
Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2);
......@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) {
delete[] v3;
}
TYPED_TEST(TensorFuncTest, MinCorrectness) {
TYPED_TEST(TensorFuncTest, MinCorrectness)
{
using T = TypeParam;
size_t size = 4;
......@@ -197,7 +206,8 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) {
delete[] v3;
}
TYPED_TEST(TensorFuncTest, AnyCorrectness) {
TYPED_TEST(TensorFuncTest, AnyCorrectness)
{
using T = TypeParam;
T* v = new T[4]{T(1), T(2), T(3), T(4)};
......@@ -207,7 +217,8 @@ TYPED_TEST(TensorFuncTest, AnyCorrectness) {
delete[] v;
}
TYPED_TEST(TensorFuncTest, AllCorrectness) {
TYPED_TEST(TensorFuncTest, AllCorrectness)
{
using T = TypeParam;
constexpr size_t size = 4;
......@@ -221,7 +232,8 @@ TYPED_TEST(TensorFuncTest, AllCorrectness) {
delete[] v2;
}
TYPED_TEST(TensorFuncTest, SliceCorrectness) {
TYPED_TEST(TensorFuncTest, SliceCorrectness)
{
using T = TypeParam;
constexpr int size = 12;
......
......@@ -20,8 +20,8 @@
#include <assert.h> // assert
#include <float.h> // FLT_MAX
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <limits> // numeric_limits
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // string
#include <vector> // vector
......@@ -36,32 +36,37 @@
using namespace turbomind;
class TestFailureError : public std::exception {
class TestFailureError: public std::exception {
private:
std::string msg_;
public:
explicit TestFailureError() = default;
explicit TestFailureError(std::string name, std::string msg = "") {
explicit TestFailureError(std::string name, std::string msg = "")
{
msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
}
const char* what () const throw () {
const char* what() const throw()
{
return msg_.c_str();
}
};
#define EXPECT_TRUE(cond) \
do { if(!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
do { \
if (!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
} \
} while (false)
#define EXPECT_FALSE(cond) \
do { if(cond) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
do { \
if (cond) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
} \
} while (false)
bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
{
......@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
}
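// The body of almostEqual is elided by the diff; a mixed absolute/relative
// tolerance of this shape is the conventional check (a sketch, not the verbatim
// implementation):
//     return fabsf(a - b) <= atol + rtol * fabsf(b);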
template<typename T>
bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float rtol) {
bool checkResult(std::string name, T* out, T* ref, size_t size, float atol, float rtol)
{
size_t failures = 0;
float relative_gap = 0.0f;;
float relative_gap = 0.0f;
;
for (size_t i = 0; i < size; ++i) {
// The values for the output and the reference.
......@@ -109,14 +116,17 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
// Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * size);
TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
failures <= tol_failures ? "....OK" : "FAILED", name.c_str(),
100. * failures / size, atol, rtol, 100. * relative_gap);
failures <= tol_failures ? "....OK" : "FAILED",
name.c_str(),
100. * failures / size,
atol,
rtol,
100. * relative_gap);
return failures <= tol_failures;
}
template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size,
bool device_out = true, bool device_ref = false)
bool checkResult(std::string name, T* out, T* ref, size_t size, bool device_out = true, bool device_ref = false)
{
bool is_fp32 = sizeof(T) == 4;
float atol = is_fp32 ? 1e-4f : 1e-3f;
......@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
ref = h_ref;
}
bool is_ok = checkResult(name, out, ref, size, atol, rtol);
if (h_out != nullptr){
if (h_out != nullptr) {
delete[] h_out;
}
if (h_ref != nullptr) {
......@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
}
template<typename T>
void initRandom(T* ptr, size_t size, float minval, float maxval) {
void initRandom(T* ptr, size_t size, float minval, float maxval)
{
for (size_t i = 0; i < size; ++i) {
float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
val *= (maxval - minval);
......@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) {
}
}
void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
void initRandomInt(int* ptr, size_t size, int minval, int maxval)
{
assert(minval < maxval);
int mod = maxval - minval;
for (size_t i = 0; i < size; ++i) {
......@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
}
template<typename T>
void tile(T* x, int m, int n) {
void tile(T* x, int m, int n)
{
for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) {
x[i * n + j] = x[j];
......@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) {
}
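// tile() broadcasts row 0 (and the overload below broadcasts src) across rows
// 1..m-1, e.g. to repeat one prepared logit row for every entry of a batch.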
template<typename T>
void tile(T* dst, T* src, int m, int n) {
void tile(T* dst, T* src, int m, int n)
{
for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) {
dst[i * n + j] = src[j];
......@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) {
#define HALF_FLT_MAX 65504.0f
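// 65504 is the largest finite value representable in IEEE-754 half precision.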
template<typename T>
bool isHalf() {
bool isHalf()
{
return std::is_same<T, half>::value;
}
template<typename T>
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr) {
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr);
}