Unverified commit 4c9959f6 authored by Chen Xin, committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
......@@ -24,6 +24,12 @@
namespace turbomind {
// cub.cuh brings windows.h
// should be included after cub.cuh
#ifdef ERROR
#undef ERROR
#endif
class Logger {
public:
......
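Note: background for the guard above. windows.h (pulled in transitively by cub.cuh) defines ERROR as a plain preprocessor macro in wingdi.h, so any identifier literally named ERROR, such as a log-level enumerator in Logger, stops compiling on Windows. A minimal sketch of the clash, with a hypothetical Level enum standing in for Logger's log levels:

    // Illustration only; the windows.h include is commented out so the
    // snippet stays portable. wingdi.h contains: #define ERROR 0
    // #include <windows.h>
    #ifdef ERROR
    #undef ERROR  // without this, ERROR below is rewritten to 0 by the preprocessor
    #endif

    enum class Level {
        INFO  = 0,
        ERROR = 2,  // would expand to "0 = 2" while the macro is live
    };

    int main()
    {
        return static_cast<int>(Level::ERROR) == 2 ? 0 : 1;
    }

Undefining the macro after the cub.cuh include, as the hunk does, keeps the enumerator usable without touching call sites.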
......@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/logger.h"
......@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
}
// get slices
ConcateSlice slice0{.slices = {{0, dim0}}};
ConcateSlice slice1{.slices = {{0, dim1}}};
ConcateSlice slice0{{{0, dim0}}};
ConcateSlice slice1{{{0, dim1}}};
if (slices.size() > 0 && slices[0].slices.size() > 0) {
slice0 = slices[0];
}
......
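Note: the slice rewrite above is not cosmetic. `.slices = {...}` is a designated initializer, which C++ only standardized in C++20; GCC and Clang accept it earlier as an extension, but MSVC rejects it at the pre-C++20 levels this codebase builds with. Positional aggregate initialization is the portable spelling. A self-contained sketch with a stand-in for ConcateSlice (the real type lives in Tensor.h):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Stand-in resembling turbomind's ConcateSlice.
    struct ConcateSlice {
        std::vector<std::pair<size_t, size_t>> slices;
    };

    int main()
    {
        size_t dim0 = 4096;
        // ConcateSlice slice0{.slices = {{0, dim0}}};  // C++20 designated init; MSVC pre-C++20 rejects it
        ConcateSlice slice0{{{0, dim0}}};               // portable positional form
        return slice0.slices[0].second == dim0 ? 0 : 1;
    }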
......@@ -15,6 +15,7 @@
*/
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/macro.h"
#include <atomic>
namespace turbomind {
......
......@@ -18,7 +18,7 @@
#include "nvtx_utils.h"
#ifdef USE_NVTX
#include "nvToolsExt.h"
#include "nvtx3/nvToolsExt.h"
#endif
namespace ft_nvtx {
......
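Note: recent CUDA toolkits ship the NVTX v3 headers under include/nvtx3/, which appears to be the reliable location on Windows as well, hence the path change. A sketch of the usual guarded usage, assuming the standard NVTX range calls:

    #ifdef USE_NVTX
    #include "nvtx3/nvToolsExt.h"
    #endif

    void profiled_section()
    {
    #ifdef USE_NVTX
        nvtxRangePushA("profiled_section");  // opens a named range in Nsight timelines
    #endif
        // ... work to profile ...
    #ifdef USE_NVTX
        nvtxRangePop();                      // closes the innermost open range
    #endif
    }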
......@@ -49,12 +49,12 @@ Tensor fused_gemm_dq_helper(
const T* scales_ptr = get_ptr<const T>(scales);
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
T* output_tensor_ptr = get_ptr<T>(output_tensor);
T* output_tensor_ptr = get_ptr<T>(output_tensor);
char* ws_ptr = get_ptr<char>(ws_tensor);
cudaEvent_t start, stop;
......@@ -258,12 +258,12 @@ Tensor fused_gemm_dq_bias_act_helper(
const T* bias_ptr = get_ptr<const T>(bias);
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
T* output_tensor_ptr = get_ptr<T>(output_tensor);
T* output_tensor_ptr = get_ptr<T>(output_tensor);
char* ws_ptr = get_ptr<char>(ws_tensor);
fused_gemm_dq_runner.gemm_bias_act(input_act_ptr,
......
......@@ -14,11 +14,11 @@
* limitations under the License.
*/
#include <chrono>
#include <cstdlib>
#include <cublas_v2.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <chrono>
#include "torch/csrc/cuda/Stream.h"
#include <torch/custom_class.h>
......@@ -37,18 +37,17 @@ using torch_ext::get_ptr;
namespace ft = turbomind;
template<typename T>
void int8_gemm_test(
const int m,
const int n,
const int k,
const at::ScalarType output_data_type,
const QuantMode quant_mode,
const int iters)
void int8_gemm_test(const int m,
const int n,
const int k,
const at::ScalarType output_data_type,
const QuantMode quant_mode,
const int iters)
{
const bool per_token_quant = quant_mode == QuantMode::PerTokenChannelQuant
|| quant_mode == QuantMode::PerTokenQuant;
const bool per_channel_quant = quant_mode == QuantMode::PerTokenChannelQuant
|| quant_mode == QuantMode::PerChannelQuant;
const bool per_token_quant =
quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant;
const bool per_channel_quant =
quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant;
const int row_scale_size = per_token_quant ? m : 1;
const int col_scale_size = per_channel_quant ? n : 1;
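Note: for context on those scale shapes, per-token quantization gives each of the m rows its own scale and per-channel gives each of the n output columns one; dequantizing the int32 accumulator multiplies both, which is what the allclose check further down compares against. A rough reference sketch of that dequantization (assumed from the test's y * alpha_row * alpha_col comparison, not code from the diff):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // y_float[i][j] = y_int32[i][j] * alpha_row[i] * alpha_col[j],
    // with either scale collapsing to one value when not per-token/per-channel.
    std::vector<float> dequantize(const std::vector<int32_t>& y_int32,
                                  const std::vector<float>&   alpha_row,  // m entries, or 1
                                  const std::vector<float>&   alpha_col,  // n entries, or 1
                                  int m, int n)
    {
        std::vector<float> y(static_cast<size_t>(m) * n);
        for (int i = 0; i < m; ++i) {
            const float ra = alpha_row[alpha_row.size() == 1 ? 0 : i];
            for (int j = 0; j < n; ++j) {
                const float ca = alpha_col[alpha_col.size() == 1 ? 0 : j];
                y[i * n + j] = static_cast<float>(y_int32[i * n + j]) * ra * ca;
            }
        }
        return y;
    }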
......@@ -76,16 +75,16 @@ void int8_gemm_test(
ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr<int32_t>(w)}.saveNpy("w.npy");
ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y)}.saveNpy("y.npy");
auto x_gpu = x.to(at_int8).to(torch::kCUDA);
auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
auto w_gpu = w.to(at_int8).to(torch::kCUDA);
auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
auto x_gpu = x.to(at_int8).to(torch::kCUDA);
auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
auto w_gpu = w.to(at_int8).to(torch::kCUDA);
auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false));
auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
* torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
* torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
auto alpha_row_torch = alpha_row_cultass.expand({m, 1});
auto alpha_col_torch = alpha_col_cutlass.expand({1, n});
......@@ -101,40 +100,41 @@ void int8_gemm_test(
auto stream = at::cuda::getCurrentCUDAStream().stream();
// warm_up
cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)m, (size_t)k}, get_ptr<int8_t>(x_gpu)}.saveNpy("x_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy("y_gpu_int32.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy(
"y_gpu_int32.npy");
ft::check_cuda_error(cudaStreamSynchronize(stream));
auto start = high_resolution_clock::now();
for (int i = 0; i < iters; ++i) {
cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
}
ft::check_cuda_error(cudaStreamSynchronize(stream));
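Note: the shape of this benchmark is the standard CUDA timing recipe, and it is what the "use cudaStreamSynchronize instead of volatile check" commit in this PR settles on: one warm-up launch, a synchronize, start the host clock, launch N times, synchronize again, then read the clock, because kernel launches are asynchronous and the host clock would otherwise stop before the GPU finishes. A stripped-down sketch (time_kernel_ms and LaunchFn are illustrative names):

    #include <chrono>
    #include <cuda_runtime.h>

    // Times `iters` asynchronous launches issued by `launch` on `stream`.
    template<typename LaunchFn>
    double time_kernel_ms(LaunchFn launch, cudaStream_t stream, int iters)
    {
        launch();                         // warm-up: JIT, caches, clock ramp
        cudaStreamSynchronize(stream);    // fence before starting the clock

        auto start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < iters; ++i)
            launch();                     // enqueue only; no host-side wait
        cudaStreamSynchronize(stream);    // fence again before stopping
        auto end = std::chrono::high_resolution_clock::now();

        return std::chrono::duration<double, std::milli>(end - start).count() / iters;
    }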
......@@ -142,27 +142,30 @@ void int8_gemm_test(
auto duration = duration_cast<microseconds>(end - start);
if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
if (torch::allclose(
(y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
} else {
}
else {
TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) *
// alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
}
}
int main(int argc, char **argv)
int main(int argc, char** argv)
{
if (argc != 7) {
TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
TM_LOG_ERROR(
"arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
return 0;
}
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int k = atoi(argv[3]);
const at::ScalarType output_data_type = atoi(argv[4]) == 0 ?
at::ScalarType::Half : at::ScalarType::BFloat16;
const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int k = atoi(argv[3]);
const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? at::ScalarType::Half : at::ScalarType::BFloat16;
const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
if (quant_mode == QuantMode::PerChannelQuant) {
printf("per channel quant \n");
}
......@@ -170,7 +173,8 @@ int main(int argc, char **argv)
if (output_data_type == at::ScalarType::Half) {
int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
} else {
}
else {
#if ENABLE_BF16
int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
#endif
......
......@@ -20,7 +20,12 @@ FetchContent_Declare(
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG release-1.12.1
)
add_definitions(-DTORCH_CUDA=1)
find_package(CUDAToolkit REQUIRED)
if (NOT MSVC)
add_definitions(-DTORCH_CUDA=1)
endif()
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
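Note: that cache line only matters on MSVC. googletest defaults to the static CRT (/MT) on Windows while the surrounding project typically builds against the DLL CRT (/MD), and mixing the two fails at link time, so the option must be forced before googletest is configured. The usual FetchContent ordering, sketched with the repository and tag declared above:

    include(FetchContent)
    FetchContent_Declare(
      googletest
      GIT_REPOSITORY https://github.com/google/googletest.git
      GIT_TAG        release-1.12.1
    )
    # Must be set before googletest is configured; aligns its CRT (/MD) with ours on MSVC.
    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
    FetchContent_MakeAvailable(googletest)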
......@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14)
# Sorted by alphabetical order of test name.
target_link_libraries( # Libs for test_attention_kernels
unittest PUBLIC
-lcudart -lcurand
CUDA::cudart CUDA::curand
gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
target_link_libraries( # Libs for test_logprob_kernels
unittest PUBLIC
-lcudart
CUDA::cudart
logprob_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_penalty_kernels
unittest PUBLIC
-lcublas -lcublasLt -lcudart
CUDA::cublas CUDA::cublasLt CUDA::cudart
sampling_penalty_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_sampling_kernel
unittest PUBLIC
-lcudart
CUDA::cudart
sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger)
target_link_libraries( # Libs for test_sampling_layer
unittest PUBLIC
-lcublas -lcublasLt -lcudart
CUDA::cublas CUDA::cublasLt CUDA::cudart
cublasMMWrapper memory_utils
DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger)
target_link_libraries( # Libs for test_tensor
......@@ -65,7 +70,7 @@ target_link_libraries( # Libs for test_tensor
remove_definitions(-DTORCH_CUDA=1)
add_executable(test_gemm test_gemm.cu)
target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper tensor cuda_utils logger)
target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
add_executable(test_gpt_kernels test_gpt_kernels.cu)
target_link_libraries(test_gpt_kernels PUBLIC
......@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC
add_executable(test_context_attention_layer test_context_attention_layer.cu)
target_link_libraries(test_context_attention_layer PUBLIC
Llama -lcublas -lcublasLt -lcudart
Llama CUDA::cublas CUDA::cublasLt CUDA::cudart
unfused_attention_kernels
memory_utils tensor cublasMMWrapper cuda_utils logger)
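Note: the pattern running through this whole CMakeLists change is that raw `-lcudart`-style flags are Unix linker syntax MSVC does not understand, while the `CUDA::` imported targets created by `find_package(CUDAToolkit)` resolve to the right import libraries and include directories on every platform. The minimal recipe, sketched (`my_test` is a placeholder target):

    cmake_minimum_required(VERSION 3.17)  # FindCUDAToolkit ships with CMake 3.17+
    project(example LANGUAGES CXX CUDA)

    find_package(CUDAToolkit REQUIRED)

    add_executable(my_test test.cu)
    # Imported targets instead of -lcudart/-lcublas: portable across Linux and MSVC.
    target_link_libraries(my_test PRIVATE
        CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::curand)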
......@@ -14,13 +14,12 @@
* limitations under the License.
*/
#include "gtest_utils.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "gtest_utils.h"
#include <curand.h>
#include <sstream>
......
......@@ -336,35 +336,26 @@ int main(int argc, const char* argv[])
// compute actual
using AttentionOp = FlashAttentionOp<scalar_t>;
using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = num_heads * seq_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = seq_len * size_per_head};
Layout layout_k{.stride_batch = num_heads * key_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_v{.stride_batch = num_heads * key_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_o{.stride_batch = num_heads * seq_len * size_per_head,
.stride_seq = num_heads * size_per_head,
.stride_head = size_per_head,
.use_seqlens = true};
Layout layout_q{num_heads * seq_len * size_per_head, size_per_head, seq_len * size_per_head};
Layout layout_k{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
Layout layout_v{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
Layout layout_o{num_heads * seq_len * size_per_head, num_heads * size_per_head, size_per_head, true};
AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head);
float* accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true);
typename AttentionOp::Params attn_params{.attn_out = actual_out_ptr,
.query = query_ptr,
.key = key_ptr,
.val = val_ptr,
.mask = mask_ptr,
.out_accum = accum_buf_ptr,
.cu_seqlens_q = cu_seqlens_ptr,
.cu_seqlens_k = nullptr,
.group_size = 1,
.layout_q = layout_q,
.layout_k = layout_k,
.layout_v = layout_v,
.layout_o = layout_o};
typename AttentionOp::Params attn_params{actual_out_ptr,
query_ptr,
key_ptr,
val_ptr,
mask_ptr,
accum_buf_ptr,
cu_seqlens_ptr,
nullptr,
1,
layout_q,
layout_k,
layout_v,
layout_o};
flash_attention(attn_params, stream);
sync_check_cuda_error();
......
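Note: same C++20 designated-initializer portability fix as in Tensor.h, with one caveat worth keeping in mind when reading the new form: positional aggregate initialization is order-sensitive, so these braces silently depend on AttentionLayout and Params keeping their members in exactly the declared order (stride_batch, stride_seq, stride_head, use_seqlens, and so on). A tiny sketch of the hazard with a stand-in layout:

    #include <cstddef>

    // Stand-in resembling the test's AttentionLayout.
    struct Layout {
        size_t stride_batch;
        size_t stride_seq;
        size_t stride_head;
        bool   use_seqlens = false;
    };

    int main()
    {
        // Positional: correct only while the member order above is unchanged.
        Layout layout_o{64 * 128, 64, 128, true};
        // The removed C++20 form was reorder-proof and self-documenting:
        // Layout layout_o{.stride_batch = 64 * 128, .stride_seq = 64,
        //                 .stride_head = 128, .use_seqlens = true};
        return layout_o.use_seqlens ? 0 : 1;
    }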
#include <assert.h>
#include <math.h>
#include <cublas_v2.h>
#include <math.h>
#include <numeric>
#include <stdexcept>
#include <tuple>
......@@ -18,35 +18,38 @@ using namespace turbomind;
// Can be replaced by the function provided by a test framework
class TestFailureError : public std::exception {
class TestFailureError: public std::exception {
private:
std::string msg_;
public:
explicit TestFailureError() = default;
explicit TestFailureError(std::string name, std::string msg = "") {
explicit TestFailureError(std::string name, std::string msg = "")
{
msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
}
const char* what () const throw () {
const char* what() const throw()
{
return msg_.c_str();
}
};
#define EXPECT_TRUE(cond) \
do { if(!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
__func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
do { \
bool is_ok = checkResult<dtype,ctype>(name, out, ref); \
if(!is_ok) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \
__func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while(false)
#define EXPECT_TRUE(cond) \
do { \
if (!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while (false)
#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
do { \
bool is_ok = checkResult<dtype, ctype>(name, out, ref); \
if (!is_ok) { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while (false)
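Note: the reformatted macros keep the `do { ... } while (false)` wrapper, which is what makes a multi-statement macro behave like a single statement: it demands a trailing semicolon and binds correctly under an unbraced if/else. A minimal illustration (hypothetical CHECK macros, not from this file):

    #include <cstdio>

    static int failures = 0;

    // Two statements: under "if (x) CHECK_BAD(c); else ..." the else attaches
    // to the macro's inner if and the code no longer means what it says.
    #define CHECK_BAD(cond) if (!(cond)) ++failures; else std::puts("ok")

    // One statement: composes like a function call.
    #define CHECK_GOOD(cond)                                                   \
        do {                                                                   \
            if (!(cond)) ++failures;                                           \
        } while (false)

    int main()
    {
        if (failures == 0)
            CHECK_GOOD(1 + 1 == 2);  // safe in an unbraced if/else
        else
            std::puts("already failing");
        return failures;
    }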
////////////////////////////////////////////////////////////////////////////////////
......@@ -58,28 +61,29 @@ private:
public:
std::vector<size_t> shape;
DataType type;
Tensor* tensor;
void* data;
DataType type;
Tensor* tensor;
void* data;
TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false)
{
this->allocator = allocator;
this->type = dtype;
this->shape = shape;
this->type = dtype;
this->shape = shape;
size_t tensor_memsize = this->memsize();
this->data = this->allocator->malloc(tensor_memsize, false);
this->data = this->allocator->malloc(tensor_memsize, false);
if (zero_init) {
check_cuda_error(cudaMemset(data, 0x0, tensor_memsize));
} else {
}
else {
setRandomValues();
}
this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data);
}
TensorWrapper(TensorWrapper const& other)
: allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
TensorWrapper(TensorWrapper const& other):
allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
{
TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
}
......@@ -91,13 +95,14 @@ public:
void setInvalidValues()
{
size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half);
size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half);
size_t tensor_size = type_size * tensor->size();
// Fill by a random number to guarantee invalid values
check_cuda_error(cudaMemset(data, 0xdc, tensor_size));
}
void setRandomValues() {
void setRandomValues()
{
// random initialization
size_t num_elements = this->size();
switch (this->type) {
......@@ -113,7 +118,8 @@ public:
}
}
size_t size() {
size_t size()
{
size_t n_elements = 1;
for (size_t s : this->shape) {
n_elements *= s;
......@@ -121,7 +127,8 @@ public:
return n_elements;
}
size_t memsize() {
size_t memsize()
{
size_t type_size = 0;
switch (this->type) {
case TYPE_FP32:
......@@ -138,13 +145,13 @@ public:
};
template<DataType computeType>
void computeReference(GemmOp transa,
GemmOp transb,
void computeReference(GemmOp transa,
GemmOp transb,
TensorWrapper& C,
TensorWrapper& A,
TensorWrapper& B,
float alpha = 1.0f,
float beta = 0.0f)
float alpha = 1.0f,
float beta = 0.0f)
{
size_t m = C.shape[0];
size_t n = C.shape[1];
......@@ -154,28 +161,36 @@ void computeReference(GemmOp transa,
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
half h_alpha = (half)alpha;
half h_beta = (half)beta;
const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha;
const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta;
half h_alpha = (half)alpha;
half h_beta = (half)beta;
const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha;
const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta;
check_cuda_error(cublasGemmEx(cublas_handle,
getCublasOperation(transb),
getCublasOperation(transa),
n, m, k,
n,
m,
k,
_alpha,
(const void*)B.data, btype, ldb,
(const void*)A.data, atype, lda,
(const void*)B.data,
btype,
ldb,
(const void*)A.data,
atype,
lda,
_beta,
(void*)C.data, ctype, ldc,
(void*)C.data,
ctype,
ldc,
compute_type,
CUBLAS_GEMM_DEFAULT));
check_cuda_error(cublasDestroy(cublas_handle));
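Note: the operand order in that cublasGemmEx call is the standard row-major trick rather than a bug. cuBLAS is column-major, and a row-major m x n matrix is bit-for-bit the column-major view of its n x m transpose, so the row-major product C = A*B is obtained by asking cuBLAS for C^T = B^T * A^T, i.e. swapping the operands and passing (n, m, k). A CPU-only sketch verifying the index algebra:

    #include <cassert>

    // Column-major GEMM, the convention cuBLAS uses: element (i,j) sits at [i + j*ld].
    static void gemm_col_major(int m, int n, int k,
                               const float* A, int lda,
                               const float* B, int ldb,
                               float* C, int ldc)
    {
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < m; ++i) {
                float acc = 0.f;
                for (int p = 0; p < k; ++p)
                    acc += A[i + p * lda] * B[p + j * ldb];
                C[i + j * ldc] = acc;
            }
    }

    int main()
    {
        // Row-major inputs: A is 2x2, B is 2x3; expected C = A*B, row-major 2x3.
        const int n = 3, k = 2;
        const float A[] = {1, 2, 3, 4};        // [[1,2],[3,4]]
        const float B[] = {1, 0, 1, 0, 1, 1};  // [[1,0,1],[0,1,1]]
        float C[6] = {};

        // The swap from the test: compute C^T = B^T * A^T with dims (n, m, k).
        gemm_col_major(n, 2, k, B, n, A, k, C, n);

        assert(C[0 * n + 2] == 3.f && C[1 * n + 2] == 7.f);  // row-major C[0][2], C[1][2]
        return 0;
    }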
......@@ -199,13 +214,14 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
}
template<typename T>
bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol) {
bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol)
{
assert(out.type == ref.type);
size_t out_size = out.size();
size_t ref_size = ref.size();
T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size));
T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size));
T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size));
T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size));
cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost);
......@@ -219,7 +235,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
bool ok = almostEqual(a, b, atol, rtol);
// Print the error.
if( !ok && failures < 4 ) {
if (!ok && failures < 4) {
TM_LOG_ERROR(">> invalid result for i=%lu:", i);
TM_LOG_ERROR(">> found......: %10.6f", a);
TM_LOG_ERROR(">> expected...: %10.6f", b);
......@@ -234,38 +250,46 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
// Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * out_size);
TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
name.c_str(), failures <= tol_failures ? "OK" : "FAILED",
100. * failures / out_size, atol, rtol);
name.c_str(),
failures <= tol_failures ? "OK" : "FAILED",
100. * failures / out_size,
atol,
rtol);
return failures <= tol_failures;
}
template<typename T, DataType computeType>
bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref) {
float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
bool is_ok = false;
bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref)
{
float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
bool is_ok = false;
if (sizeof(T) == 4) {
is_ok = _checkResult<float>(name, out, ref, atol, rtol);
} else {
}
else {
is_ok = _checkResult<half>(name, out, ref, atol, rtol);
}
return is_ok;
}
template<typename T, DataType computeType>
bool checkResult(TensorWrapper& out, TensorWrapper& ref) {
bool checkResult(TensorWrapper& out, TensorWrapper& ref)
{
return checkResult<T, computeType>("", out, ref);
}
template<typename T>
std::string toString() {
std::string toString()
{
std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16";
return str;
}
template<typename T, DataType ctype>
std::string toString() {
std::string toString()
{
std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16";
str += ", compute_type=";
......@@ -273,7 +297,8 @@ std::string toString() {
return str;
}
std::string toString(GemmOp op) {
std::string toString(GemmOp op)
{
return op == GEMM_OP_N ? "N" : "T";
}
......@@ -282,38 +307,38 @@ struct GemmOpPair {
GemmOp transb;
};
static const std::vector<GemmOpPair> op_pairs {{GEMM_OP_N, GEMM_OP_N},
{GEMM_OP_N, GEMM_OP_T},
{GEMM_OP_T, GEMM_OP_N},
{GEMM_OP_T, GEMM_OP_T}};
static const std::vector<GemmOpPair> op_pairs{
{GEMM_OP_N, GEMM_OP_N}, {GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_T, GEMM_OP_N}, {GEMM_OP_T, GEMM_OP_T}};
static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb,
size_t m, size_t n, size_t k)
static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, size_t m, size_t n, size_t k)
{
return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]",
func_name, getGemmOpString(transa).c_str(), getGemmOpString(transb).c_str(),
m, n, k);
func_name,
getGemmOpString(transa).c_str(),
getGemmOpString(transb).c_str(),
m,
n,
k);
}
static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs,
size_t m, size_t n, size_t k)
static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, size_t m, size_t n, size_t k)
{
return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k);
}
/////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T, DataType computeType>
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
TM_LOG_INFO(
"Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
Allocator<AllocatorType::CUDA> allocator(getDevice());
DataType dtype = getTensorType<T>();
DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
......@@ -322,72 +347,80 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
computeReference<computeType>(op_pair.transa, op_pair.transb,
expected, a_tensor, b_tensor);
computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_tensor.data, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_tensor.data, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
}
check_cuda_error(cudaStreamDestroy(stream));
}
template<typename T, DataType computeType>
void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
void testGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
TM_LOG_INFO(
"Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
DataType dtype = getTensorType<T>();
DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
TensorWrapper expected(&allocator, dtype, {m, n}, true);
cublasHandle_t cublas_handle;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t cuda_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
......@@ -396,7 +429,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
// Switch A/B because Gemm expects column major layout as cublas does.
......@@ -405,33 +438,50 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
size_t ldc = n;
cublas_wrapper.Gemm(getCublasOperation(op_pair.transb),
getCublasOperation(op_pair.transa),
n, m, k,
b_tensor.data, ldb,
a_tensor.data, lda,
expected.data, ldc);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_tensor.data, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
n,
m,
k,
b_tensor.data,
ldb,
a_tensor.data,
lda,
expected.data,
ldc);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_tensor.data, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
}
......@@ -442,24 +492,28 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
}
template<typename T, DataType computeType>
void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
// batch of in/out tensors
DataType a_type = getTensorType<T>();
DataType b_type = getTensorType<T>();
DataType c_type = getTensorType<T>();
DataType a_type = getTensorType<T>();
DataType b_type = getTensorType<T>();
DataType c_type = getTensorType<T>();
std::vector<TensorWrapper*> a_tensors;
std::vector<TensorWrapper*> b_tensors;
std::vector<TensorWrapper*> c_tensors;
std::vector<TensorWrapper*> expecteds;
const size_t batch_size = 3;
const size_t batch_size = 3;
for (size_t i = 0; i < batch_size; ++i) {
a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false));
b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false));
......@@ -484,26 +538,21 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
(const T*)expecteds[2]->data};
T** batch_tensor_ptrs = reinterpret_cast<T**>(allocator.malloc(sizeof(T*) * 16, false));
check_cuda_error(cudaMemcpyAsync(
(void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream));
const void* const* batch_a = reinterpret_cast<const void* const*>(batch_tensor_ptrs);
const void* const* batch_b = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4);
void* const* batch_c = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8);
void* const* batch_expected = reinterpret_cast<void* const*>(batch_tensor_ptrs + 12);
check_cuda_error(cudaMemcpyAsync((void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream));
const void* const* batch_a = reinterpret_cast<const void* const*>(batch_tensor_ptrs);
const void* const* batch_b = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4);
void* const* batch_c = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8);
void* const* batch_expected = reinterpret_cast<void* const*>(batch_tensor_ptrs + 12);
cublasHandle_t cublas_handle;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
......@@ -512,7 +561,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_type, b_type, c_type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
......@@ -526,42 +575,51 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
n,
m,
k,
(const void* const*)batch_b, ldb,
(const void* const*)batch_a, lda,
(void* const*)batch_expected, ldc,
(const void* const*)batch_b,
ldb,
(const void* const*)batch_a,
lda,
(void* const*)batch_expected,
ldc,
batch_size);
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k,
batch_a, a_type, lda,
batch_b, b_type, ldb,
batch_c, c_type, ldc,
gemm->batchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
batch_a,
a_type,
lda,
batch_b,
b_type,
ldb,
batch_c,
c_type,
ldc,
batch_size);
for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api1 batch" + std::to_string(i),
T, computeType, *c_tensors[i], *expecteds[i]);
EXPECT_ALMOST_EQUAL(
tc_name + " api1 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
}
for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues();
}
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k,
batch_a, lda,
batch_b, ldb,
batch_c, ldc,
batch_size);
gemm->batchedGemm(
op_pair.transa, op_pair.transb, m, n, k, batch_a, lda, batch_b, ldb, batch_c, ldc, batch_size);
for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api2 batch" + std::to_string(i),
T, computeType, *c_tensors[i], *expecteds[i]);
EXPECT_ALMOST_EQUAL(
tc_name + " api2 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
}
for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues();
}
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k,
batch_a, batch_b, batch_c, batch_size);
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, batch_a, batch_b, batch_c, batch_size);
for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api3 batch" + std::to_string(i),
T, computeType, *c_tensors[i], *expecteds[i]);
EXPECT_ALMOST_EQUAL(
tc_name + " api3 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
}
}
a_tensors.clear();
......@@ -574,36 +632,36 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
check_cuda_error(cudaStreamDestroy(stream));
}
template<typename T, DataType computeType>
void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k) {
void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
batch_size, m, n, k, toString<T, computeType>().c_str());
batch_size,
m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
DataType data_type = getTensorType<T>();
DataType data_type = getTensorType<T>();
TensorWrapper a_tensor(&allocator, data_type, {batch_size, m, k}, false);
TensorWrapper b_tensor(&allocator, data_type, {batch_size, k, n}, false);
TensorWrapper c_tensor(&allocator, data_type, {batch_size, m, n}, true);
TensorWrapper expected(&allocator, data_type, {batch_size, m, n}, true);
cublasHandle_t cublas_handle;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
......@@ -612,7 +670,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
// Switch A/B because Gemm expects column major layout as cublas does.
......@@ -625,7 +683,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
int64_t stridec = m * n;
float alpha = 1.0f;
float beta = 0.0f;
float beta = 0.0f;
cublas_wrapper.stridedBatchedGemm(getCublasOperation(op_pair.transb),
getCublasOperation(op_pair.transa),
......@@ -650,35 +708,78 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
getCublasDataType(computeType));
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda, stridea,
b_tensor.data, b_tensor.type, ldb, strideb,
c_tensor.data, c_tensor.type, ldc, stridec,
batch_size, computeType, alpha, beta);
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
stridea,
b_tensor.data,
b_tensor.type,
ldb,
strideb,
c_tensor.data,
c_tensor.type,
ldc,
stridec,
batch_size,
computeType,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda, stridea,
b_tensor.data, ldb, strideb,
c_tensor.data, ldc, stridec,
batch_size, alpha, beta);
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
lda,
stridea,
b_tensor.data,
ldb,
strideb,
c_tensor.data,
ldc,
stridec,
batch_size,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, stridea,
b_tensor.data, strideb,
c_tensor.data, stridec,
batch_size, alpha, beta);
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
stridea,
b_tensor.data,
strideb,
c_tensor.data,
stridec,
batch_size,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->stridedBatchedGemm(op_pair.transa, op_pair.transb, m, n, k,
gemm->stridedBatchedGemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
b_tensor.data,
c_tensor.data,
batch_size, alpha, beta);
batch_size,
alpha,
beta);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
}
......@@ -692,15 +793,16 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t
// The current SpGemm only supports TYPE_FP16 for T, computeType,
// but let us keep these template variables for later use.
template<typename T, DataType computeType>
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
TM_LOG_INFO("Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
TM_LOG_INFO(
"Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
Allocator<AllocatorType::CUDA> allocator(getDevice());
DataType dtype = getTensorType<T>();
DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
......@@ -709,47 +811,54 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
// A/B will be switched in SpGemm.
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
b_tensor.setRandomValues();
pruneMatrixB(b_tensor.data, stream,
b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
computeReference<computeType>(op_pair.transa, op_pair.transb,
expected, a_tensor, b_tensor);
pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
void* b_compressed;
compressMatrixB(&b_compressed, allocator, stream,
b_tensor.data, b_tensor.shape[0], b_tensor.shape[1],
op_pair.transb);
compressMatrixB(
&b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_compressed, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_compressed,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_compressed, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_compressed, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, (const T*)b_compressed},
c_tensor.data);
......@@ -761,34 +870,34 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
}
template<typename T, DataType computeType>
void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str());
m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream;
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
DataType dtype = getTensorType<T>();
DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
TensorWrapper expected(&allocator, dtype, {m, n}, true);
cublasHandle_t cublas_handle;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
cudaDataType_t cu_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t cu_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
......@@ -797,13 +906,12 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) {
for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name);
b_tensor.setRandomValues();
pruneMatrixB(b_tensor.data, stream,
b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
// Switch A/B because Gemm expects column major layout as cublas does.
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
......@@ -814,32 +922,40 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
n,
m,
k,
b_tensor.data, ldb,
a_tensor.data, lda,
expected.data, ldc);
b_tensor.data,
ldb,
a_tensor.data,
lda,
expected.data,
ldc);
void* b_compressed;
compressMatrixB(&b_compressed, allocator, stream,
b_tensor.data, b_tensor.shape[0], b_tensor.shape[1],
op_pair.transb);
compressMatrixB(
&b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, a_tensor.type, lda,
b_compressed, b_tensor.type, ldb,
c_tensor.data, c_tensor.type, ldc);
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_compressed,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, lda,
b_compressed, ldb,
c_tensor.data, ldc);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k,
a_tensor.data, b_compressed, c_tensor.data);
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
}
......@@ -850,18 +966,16 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
}
#endif
int main(int argc, char* argv[]) {
int main(int argc, char* argv[])
{
// testGemmCreate();
using testcase_t = std::tuple<size_t, size_t, size_t>;
std::vector<testcase_t> testcases = {{16, 32, 64},
{255, 255, 255},
{1041, 2047, 9999},
{1041, 1, 9999},
{1041, 999, 1}};
std::vector<testcase_t> testcases = {
{16, 32, 64}, {255, 255, 255}, {1041, 2047, 9999}, {1041, 1, 9999}, {1041, 999, 1}};
// Computation correctness tests
for (testcase_t &tc : testcases) {
for (testcase_t& tc : testcases) {
size_t m = std::get<0>(tc);
size_t n = std::get<1>(tc);
size_t k = std::get<2>(tc);
......@@ -887,16 +1001,16 @@ int main(int argc, char* argv[]) {
// Reset for SpGemm test.
testcases.clear();
testcases.insert(testcases.end(),
{{8, 32, 32}, // minimum possible example.
{8, 32, 64},
{64, 64, 64},
{16, 32, 64},
{1024, 32, 1024},
{1024, 1024, 32},
{16, 1024, 1024},
{1024, 1024, 1024}});
for (testcase_t &tc : testcases) {
{{8, 32, 32}, // minimum possible example.
{8, 32, 64},
{64, 64, 64},
{16, 32, 64},
{1024, 32, 1024},
{1024, 1024, 32},
{16, 1024, 1024},
{1024, 1024, 1024}});
for (testcase_t& tc : testcases) {
size_t m = std::get<0>(tc);
size_t n = std::get<1>(tc);
size_t k = std::get<2>(tc);
......
......@@ -5,10 +5,10 @@
#include <string>
#include <vector>
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include <algorithm>
#include <iostream>
......@@ -39,13 +39,14 @@ protected:
void testTransposition();
};
void fill_tensor_random(Tensor a) {
const size_t num_elems = a.size();
std::vector<int8_t> host_values(num_elems);
void fill_tensor_random(Tensor a)
{
const size_t num_elems = a.size();
std::vector<int8_t> host_values(num_elems);
std::uniform_int_distribution<int8_t> int8_random(-128, 127);
std::mt19937 rng(0);
std::mt19937 rng(0);
std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng](){ return int8_random(rng); });
std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); });
cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems);
}
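Note: one portability wrinkle in fill_tensor_random that the reformat keeps: the standard only defines std::uniform_int_distribution for short/int/long/long long and their unsigned counterparts, so instantiating it with int8_t is formally undefined, and some standard libraries (MSVC's among them, as far as I can tell) reject char-sized types outright. A portable spelling, sketched:

    #include <algorithm>
    #include <cstdint>
    #include <random>
    #include <vector>

    // Draw from a plain int distribution, then narrow to int8_t.
    static void fill_int8_random(std::vector<int8_t>& values, uint32_t seed = 0)
    {
        std::uniform_int_distribution<int> dist(-128, 127);  // int is always a valid IntType
        std::mt19937 rng(seed);
        std::generate(values.begin(), values.end(),
                      [&] { return static_cast<int8_t>(dist(rng)); });
    }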
......@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition()
int8_t *a_data, *a_t_data;
cudaMalloc(&a_data, m * k * sizeof(int8_t));
Tensor a {MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
fill_tensor_random(a);
cudaMalloc(&a_t_data, k * m * sizeof(int8_t));
Tensor a_t {MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
std::vector<int8_t> a_t_host_ref(a_t.size());
reference_transpose_host(a_t_host_ref, a);
......
#include <assert.h>
#include <math.h>
#include <float.h>
#include <math.h>
#include <stdexcept>
#include <tuple>
#include <vector>
#ifdef __linux__
#include <sys/time.h>
#endif
#include "src/turbomind/kernels/logprob_kernels.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -24,22 +25,26 @@ struct LogProbKernelTestParam {
size_t vocab_size;
size_t beam_width;
std::string toString() {
std::string toString()
{
return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]",
max_input_length, batch_size, vocab_size, beam_width);
max_input_length,
batch_size,
vocab_size,
beam_width);
}
};
/////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T>
class LogProbKernelTest : public FtTestBase {
class LogProbKernelTest: public FtTestBase {
protected:
void computeCumLogProbs(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
void computeCumLogProbs(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
......@@ -54,9 +59,9 @@ protected:
cum_log_probs[i] = 0.0f;
}
else if ((int)step < input_lengths[i]) {
size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
const T* vec = logits + step_offset + i * vocab_size_padded;
float max_logits = -FLT_MAX;
size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
const T* vec = logits + step_offset + i * vocab_size_padded;
float max_logits = -FLT_MAX;
for (size_t v = 0; v < vocab_size; ++v) {
float val = static_cast<float>(vec[v]);
if (val > max_logits) {
......@@ -67,7 +72,7 @@ protected:
for (size_t v = 0; v < vocab_size; ++v) {
sum += expf(static_cast<float>(vec[v]) - max_logits);
}
int token_id = input_ids[step * batch_size + i];
int token_id = input_ids[step * batch_size + i];
float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
if (log_probs != nullptr) {
log_probs[step * batch_size + i] = log_prob;
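Note: the reference loop above is the textbook numerically stable log-softmax: subtract the max before exponentiating so expf never overflows, then log p(token) = logit[token] - max - log(sum_v exp(logit_v - max)). The same computation, self-contained:

    #include <cfloat>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Stable log-probability of one token, mirroring computeCumLogProbs.
    static float log_prob_of(const std::vector<float>& logits, size_t token_id)
    {
        float max_logit = -FLT_MAX;
        for (float v : logits)
            max_logit = v > max_logit ? v : max_logit;  // shift so the exps stay finite
        float sum = 0.f;
        for (float v : logits)
            sum += expf(v - max_logit);
        return logits[token_id] - max_logit - logf(sum);
    }

    int main()
    {
        std::vector<float> logits = {1000.f, 1001.f, 1002.f};  // naive expf(1002.f) overflows
        return log_prob_of(logits, 2) < 0.f ? 0 : 1;           // result is log(0.665), about -0.41
    }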
......@@ -78,11 +83,11 @@ protected:
}
}
void computeCumLogProbsBatchFirst(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
void computeCumLogProbsBatchFirst(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
......@@ -98,8 +103,8 @@ protected:
cum_log_probs[i] = 0.0f;
}
else if ((int)step < input_lengths[i]) {
const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
float max_logits = -FLT_MAX;
const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
float max_logits = -FLT_MAX;
for (size_t v = 0; v < vocab_size; ++v) {
float val = static_cast<float>(vec[v]);
if (val > max_logits) {
......@@ -110,7 +115,7 @@ protected:
for (size_t v = 0; v < vocab_size; ++v) {
sum += expf(static_cast<float>(vec[v]) - max_logits);
}
int token_id = input_ids[i * max_input_length + step];
int token_id = input_ids[i * max_input_length + step];
float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
if (log_probs != nullptr) {
log_probs[i * max_input_length + step] = log_prob;
......@@ -122,17 +127,17 @@ protected:
}
public:
void runTest(LogProbKernelTestParam param) {
void runTest(LogProbKernelTestParam param)
{
size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
// Make multiple of 8 as GPT does.
size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
// input values
T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
int* h_input_ids = new int[max_input_length * batchxbeam];
T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
int* h_input_ids = new int[max_input_length * batchxbeam];
int* h_input_lengths = new int[batchxbeam];
// output buffers
......@@ -145,9 +150,9 @@ public:
memset(expected_cum_log_probs, 0, sizeof(float) * batchxbeam);
// device buffers
T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers
......@@ -157,7 +162,7 @@ public:
deviceFill(d_cum_log_probs, batchxbeam, 0.0f);
size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
void* workspace = allocator->malloc(workspace_size);
void* workspace = allocator->malloc(workspace_size);
invokeLogProbFromLogits(d_cum_log_probs,
d_logits,
d_input_ids,
......@@ -189,16 +194,17 @@ public:
delete[] h_logits;
}
void runBatchFirstTest(LogProbKernelTestParam param) {
void runBatchFirstTest(LogProbKernelTestParam param)
{
size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
// Make multiple of 8 as GPT does.
size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
// input values
T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
int* h_input_ids = new int[max_input_length * batchxbeam];
T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
int* h_input_ids = new int[max_input_length * batchxbeam];
int* h_input_lengths = new int[batchxbeam];
// output buffers
......@@ -213,8 +219,8 @@ public:
// device buffers
T* d_logits =
reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers
......@@ -224,7 +230,7 @@ public:
check_cuda_error(cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam));
size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
void* workspace = allocator->malloc(workspace_size);
void* workspace = allocator->malloc(workspace_size);
invokeLogProbFromLogits(d_cum_log_probs,
d_logits,
d_input_ids,
......@@ -239,16 +245,16 @@ public:
true);
computeCumLogProbsBatchFirst(expected_cum_log_probs,
nullptr,
h_logits,
h_input_ids,
h_input_lengths,
max_input_length,
batchxbeam,
vocab_size,
vocab_size_padded);
std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
nullptr,
h_logits,
h_input_ids,
h_input_lengths,
max_input_length,
batchxbeam,
vocab_size,
vocab_size_padded);
std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
EXPECT_TRUE(passed);
delete[] expected_cum_log_probs;
......@@ -256,10 +262,8 @@ public:
delete[] h_input_ids;
delete[] h_logits;
}
};
TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes);
TYPED_TEST(LogProbKernelTest, SingleStep)
......
......@@ -14,24 +14,24 @@
* limitations under the License.
*/
#include <algorithm> // std::min, std::max
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <algorithm> // std::min, std::max
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdexcept>
#include <stdlib.h> // rand
#include <string> // std::string
#include <stdlib.h> // rand
#include <string> // std::string
#include <unordered_map>
#include <vector> // std::vector
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "gtest_utils.h"
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "gtest_utils.h"
using namespace turbomind;
......@@ -41,21 +41,25 @@ struct TemperatureTestParam {
float* temperatures;
size_t temperatures_size;
std::string toString() {
std::string toString()
{
return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]",
batch_size, vocab_size, arr2str(temperatures, temperatures_size).c_str());
batch_size,
vocab_size,
arr2str(temperatures, temperatures_size).c_str());
}
};
size_t pad_vocab_size(size_t vocab_size, size_t pad = 8) {
size_t pad_vocab_size(size_t vocab_size, size_t pad = 8)
{
return (vocab_size + pad - 1) / pad * pad;
}
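A quick usage note for the ceiling-division rounding above, as hypothetical assertions (values verified by hand, helper name illustrative):

#include <cassert>

void padVocabSizeExamples()
{
    assert(pad_vocab_size(50257) == 50264);  // next multiple of 8
    assert(pad_vocab_size(4096) == 4096);    // already aligned
    assert(pad_vocab_size(10, 16) == 16);    // custom pad width
}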
template<typename T>
void applyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
const float repetition_penalty,
void applyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
const float repetition_penalty,
const size_t step,
const size_t max_input_length,
const size_t batch_size,
......@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits,
int token_id = output_ids[i + t * batch_size];
if (!penalized[token_id]) {
float logit = static_cast<float>(logits[offset + token_id]);
logits[offset + token_id] = static_cast<T>(logit < 0.0f ?
logit * repetition_penalty : logit / repetition_penalty);
logits[offset + token_id] =
static_cast<T>(logit < 0.0f ? logit * repetition_penalty : logit / repetition_penalty);
penalized[token_id] = true;
}
}
......@@ -84,9 +88,9 @@ void applyRepetitonPenalty(T* logits,
}
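The reference above applies the CTRL-style multiplicative rule and marks each token in `penalized` so a repeated token is penalized exactly once; the same rule in a compact standalone sketch (names illustrative):

#include <unordered_set>
#include <vector>

void penalizeHistoryOnce(std::vector<float>& logits, const std::vector<int>& history, float penalty)
{
    std::unordered_set<int> seen;
    for (int token : history) {
        if (seen.insert(token).second) {  // first occurrence only
            float& logit = logits[token];
            // penalty > 1 always moves the token toward lower probability
            logit = logit < 0.0f ? logit * penalty : logit / penalty;
        }
    }
}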
template<typename T>
void batchApplyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
void batchApplyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
const float* repetition_penalties,
const size_t step,
const size_t max_input_length,
......@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits,
}
template<typename T>
void initLogitsAndBias(T* logits,
T* bias,
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded)
void initLogitsAndBias(
T* logits, T* bias, const size_t batch_size, const size_t vocab_size, const size_t vocab_size_padded)
{
initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f);
if (bias != nullptr) {
......@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits,
}
}
/////////////////////////////////// Tests //////////////////////////////////////////
template<typename T>
class TemperaturePenaltyTest : public FtTestBase {
class TemperaturePenaltyTest: public FtTestBase {
protected:
// Set up test
size_t batch_size_;
......@@ -157,17 +157,18 @@ protected:
float* d_temperatures_;
void subsetup(TemperatureTestParam param) {
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
void subsetup(TemperatureTestParam param)
{
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_);
h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
if (param.temperatures_size > 1) {
......@@ -177,7 +178,8 @@ protected:
}
}
void subteardown() {
void subteardown()
{
delete[] h_logits_;
delete[] h_bias_;
}
......@@ -195,7 +197,7 @@ protected:
ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature;
for (size_t j = 0; j < vocab_size; ++j) {
size_t index = i * vocab_size_padded + j;
float logit = static_cast<float>(logits[index]);
float logit = static_cast<float>(logits[index]);
if (bias != nullptr) {
logit += static_cast<float>(bias[j]);
}
......@@ -204,29 +206,18 @@ protected:
}
}
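The elided reference line presumably divides the biased logit by the temperature; a one-line sketch of that assumed semantics (T < 1 sharpens the distribution, T > 1 flattens it):

inline float referenceTemperature(float logit, float bias, float temperature)
{
    // temperature must be positive, matching the ASSERT_GT above
    return (logit + bias) / temperature;
}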
public:
void runTest(TemperatureTestParam param)
{
subsetup(param);
// Do test
if (param.temperatures_size == 1) {
invokeApplyTemperaturePenalty(d_logits_,
d_bias_,
param.temperatures[0],
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
invokeApplyTemperaturePenalty(
d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
}
else {
invokeBatchApplyTemperaturePenalty(d_logits_,
d_bias_,
d_temperatures_,
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
invokeBatchApplyTemperaturePenalty(
d_logits_, d_bias_, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
}
computeReference(h_logits_,
h_bias_,
......@@ -240,21 +231,17 @@ public:
subteardown();
}
void runConsistencyTest(TemperatureTestParam param) {
void runConsistencyTest(TemperatureTestParam param)
{
// Set up test
ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1";
subsetup(param);
// Run a single runtime value case.
invokeApplyTemperaturePenalty(d_logits_,
d_bias_,
param.temperatures[0],
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
float temperature = param.temperatures[0];
invokeApplyTemperaturePenalty(
d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
float temperature = param.temperatures[0];
float* h_temperatures = new float[batch_size_];
for (size_t i = 0; i < batch_size_; ++i) {
h_temperatures[i] = temperature;
......@@ -263,18 +250,14 @@ public:
cudaAutoCpy(d_temperatures_, h_temperatures, batch_size_, stream);
T* d_logits_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream);
invokeBatchApplyTemperaturePenalty(d_logits_batch,
d_bias_batch,
d_temperatures_,
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
bool passed = checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
invokeBatchApplyTemperaturePenalty(
d_logits_batch, d_bias_batch, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
bool passed =
checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
EXPECT_TRUE(passed);
// Tear down test
......@@ -315,7 +298,7 @@ TYPED_TEST(TemperaturePenaltyTest, LargeVocab)
TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 1.0f;
......@@ -325,7 +308,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 0.53f;
......@@ -335,7 +318,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 2.01f;
......@@ -345,10 +328,10 @@ TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
TYPED_TEST(TemperaturePenaltyTest, BatchMixed)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = i % 2 ==0 ? 2.01f : 0.53f;
temperatures[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, temperatures, batch_size});
}
......@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase {
size_t repetition_penalties_size;
RepetitionPenaltyType repetition_penalty_type;
std::string toString() {
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map {
std::string toString()
{
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map{
{RepetitionPenaltyType::Additive, "additive"},
{RepetitionPenaltyType::Multiplicative, "multiplicative"},
{RepetitionPenaltyType::None, "none"}};
return fmtstr(
"RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
"repetition_penalties=%s, repetition_penalty_type=%s]",
batch_size, vocab_size, max_input_length,
arr2str(repetition_penalties, repetition_penalties_size).c_str(),
typestr_map.at(repetition_penalty_type).c_str());
return fmtstr("RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
"repetition_penalties=%s, repetition_penalty_type=%s]",
batch_size,
vocab_size,
max_input_length,
arr2str(repetition_penalties, repetition_penalties_size).c_str(),
typestr_map.at(repetition_penalty_type).c_str());
}
};
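The typestr_map above names two penalty formulas plus a no-op; a side-by-side sketch of the conventional semantics (assumed here, since the formulas live in the elided kernels — the additive neutral value is 0 and the multiplicative one is 1, matching the HasDefaultValueZero tests below):

inline float applyOnePenalty(float logit, float penalty, RepetitionPenaltyType type)
{
    switch (type) {
        case RepetitionPenaltyType::Additive:  // neutral value: 0
            return logit - penalty;
        case RepetitionPenaltyType::Multiplicative:  // neutral value: 1
            return logit < 0.0f ? logit * penalty : logit / penalty;
        default:  // RepetitionPenaltyType::None
            return logit;
    }
}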
template<typename T>
class RepetitionPenaltyTest : public FtTestBase {
class RepetitionPenaltyTest: public FtTestBase {
protected:
// Set up test
size_t batch_size_;
......@@ -392,37 +377,38 @@ protected:
size_t sequence_length_;
size_t step_;
T* h_logits_;
T* h_bias_;
T* h_logits_;
T* h_bias_;
int* h_output_ids_;
int* h_input_lengths_;
T* d_logits_;
T* d_bias_;
T* d_logits_;
T* d_bias_;
int* d_output_ids_;
int* d_input_lengths_;
float* d_repetition_penalties_;
void subsetup(RepetitionPenaltyTestCase param) {
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
void subsetup(RepetitionPenaltyTestCase param)
{
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_);
max_input_length_ = param.max_input_length;
sequence_length_ = 2 * max_input_length_; // input + output
step_ = sequence_length_ * 0.7;
max_input_length_ = param.max_input_length;
sequence_length_ = 2 * max_input_length_; // input + output
step_ = sequence_length_ * 0.7;
h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
h_output_ids_ = new int[sequence_length_ * batch_size_];
h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
h_output_ids_ = new int[sequence_length_ * batch_size_];
h_input_lengths_ = new int[batch_size_];
initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
initRandomInt(h_output_ids_, sequence_length_ * batch_size_, 0, vocab_size_);
initRandomInt(h_input_lengths_, batch_size_, 1, max_input_length_);
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
......@@ -437,7 +423,8 @@ protected:
}
}
void subteardown() {
void subteardown()
{
delete[] h_logits_;
delete[] h_bias_;
delete[] h_output_ids_;
......@@ -540,7 +527,8 @@ public:
subteardown();
}
void runConsistencyTest(RepetitionPenaltyTestCase param) {
void runConsistencyTest(RepetitionPenaltyTestCase param)
{
// Set up test
ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1";
subsetup(param);
......@@ -618,7 +606,7 @@ TYPED_TEST(RepetitionPenaltyTest, LargeVocab)
TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = 1.0f;
......@@ -628,7 +616,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = 0.53f;
......@@ -638,7 +626,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 2.01f;
......@@ -648,10 +636,10 @@ TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
TYPED_TEST(RepetitionPenaltyTest, BatchMixed)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Multiplicative});
}
......@@ -664,10 +652,10 @@ TYPED_TEST(RepetitionPenaltyTest, Consistency)
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditive)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
......@@ -680,10 +668,10 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero)
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero2)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 1.0f : 0.0f;
repetition_penalties[i] = i % 2 == 0 ? 1.0f : 0.0f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
......
......@@ -12,6 +12,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......
#include <algorithm> // std::fill_n
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // std::string
#include <vector> // std::vector
#include <algorithm> // std::fill_n
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // std::string
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
......@@ -14,6 +14,7 @@
#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -68,9 +69,9 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
}
for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i;
int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval;
probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
}
}
}
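computeProb above is the max-shifted softmax with an epsilon guard on the denominator; the same idea as a float-only standalone routine (illustrative):

#include <cmath>

void softmaxRow(float* probs, const float* logits, int vocab_size, float epsilon)
{
    float maxval = logits[0];
    for (int i = 1; i < vocab_size; ++i) {
        maxval = fmaxf(maxval, logits[i]);
    }
    float sum = 0.0f;
    for (int i = 0; i < vocab_size; ++i) {
        sum += expf(logits[i] - maxval);
    }
    for (int i = 0; i < vocab_size; ++i) {
        probs[i] = expf(logits[i] - maxval) / (sum + epsilon);
    }
}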
......@@ -96,8 +97,8 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
}
for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval;
int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval;
logprobs[idx] = static_cast<T>(logit - logf(sum + EPSILON));
}
}
......@@ -119,10 +120,10 @@ public:
}
protected:
unsigned long long seed = 0;
cudaStream_t stream;
unsigned long long seed = 0;
cudaStream_t stream;
Allocator<AllocatorType::CUDA>* allocator;
curandState_t* curand_states;
curandState_t* curand_states;
};
template<typename T>
......@@ -393,8 +394,8 @@ public:
{
this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
}
};
......@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral)
this->runTest({6, 4, 1, 4, 1.0f, 1});
};
TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63)
{
this->runTest({16, 51200, 1, 63, 1.0f, 8});
......@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP)
this->runBatchTest({8, 4000, 1, 63, 0.3f, 8});
};
template<typename T>
class TopPSamplingKernelTest: public SamplingKernelTest<T> {
......@@ -473,7 +472,7 @@ public:
size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size;
size_t output_len = param.output_len;
size_t seq_len = output_len;
size_t seq_len = output_len;
float top_p = param.top_p;
......@@ -496,8 +495,8 @@ public:
struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>(
allocator->malloc(sizeof(curandState_t) * batch_size, false));
curandState_t* curand_states =
reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream);
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
......@@ -515,17 +514,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0;
size_t workspace_size = 0;
size_t cub_temp_storage_size = 0;
// retrieve the workspace size of the top-p sampling kernel.
invokeTopPSampling<T>(nullptr, // workspace
workspace_size,
cub_temp_storage_size,
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
(T*)nullptr, // log_probs
topp_id_vals_buf,
end_offsets,
......@@ -553,12 +552,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf,
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
invokeTopPSampling<T>(workspace,
workspace_size,
......@@ -612,7 +606,7 @@ public:
size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size;
float top_p = param.top_p;
float top_p = param.top_p;
float* h_top_ps = new float[batch_size];
// Initialize runtime top p values.
for (size_t i = 0; i < batch_size; ++i) {
......@@ -621,7 +615,7 @@ public:
float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size);
size_t output_len = param.output_len;
size_t seq_len = output_len;
size_t seq_len = output_len;
// Logit values in the host of shape (batch_size x vocab_size).
T* h_logits = new T[batch_size * vocab_size];
......@@ -647,8 +641,8 @@ public:
struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>(
allocator->malloc(sizeof(curandState_t) * batch_size, false));
curandState_t* curand_states =
reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream);
float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size));
......@@ -668,17 +662,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0;
size_t workspace_size = 0;
size_t cub_temp_storage_size = 0;
// retrieve the workspace size of the top-p sampling kernel.
invokeBatchTopPSampling<T>(nullptr, // workspace
workspace_size,
cub_temp_storage_size,
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
(T*)nullptr, // log_probs
topp_id_vals_buf,
end_offsets,
......@@ -709,12 +703,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf,
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
invokeBatchTopPSampling<T>(workspace,
workspace_size,
......@@ -773,8 +762,8 @@ public:
{
this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
}
};
......@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
};
__global__
void generateRandomNumber(unsigned int *vals, curandState_t *states, const int batch_size) {
__global__ void generateRandomNumber(unsigned int* vals, curandState_t* states, const int batch_size)
{
int idx = threadIdx.x;
if (idx < batch_size) {
vals[idx] = curand(states + idx);
}
}
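This kernel draws from one curand state per batch slot; a sketch of the matching per-slot initializer such a test depends on (the signature here is an assumption, not the diffed invokeCurandBatchInitialize):

#include <curand_kernel.h>

__global__ void initCurandStates(curandState_t* states, const unsigned long long* seeds, int batch_size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < batch_size) {
        // equal seed and subsequence yield identical random streams
        curand_init(seeds[idx], /*subsequence=*/0, /*offset=*/0, &states[idx]);
    }
}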
TEST(SamplingKernelTest, CurandBatchInitialize) {
size_t batch_size = 127;
TEST(SamplingKernelTest, CurandBatchInitialize)
{
size_t batch_size = 127;
cudaStream_t stream;
cudaStreamCreate(&stream);
curandState_t* curand_states;
check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size));
unsigned long long* h_random_seeds = new unsigned long long[batch_size];
const size_t period_size = 3;
const size_t period_size = 3;
for (size_t i = 0; i < batch_size; ++i) {
h_random_seeds[i] = i / period_size;
}
unsigned long long* d_random_seeds;
check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size));
check_cuda_error(cudaMemcpy(d_random_seeds, h_random_seeds,
sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
check_cuda_error(
cudaMemcpy(d_random_seeds, h_random_seeds, sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
// Initialize curand states.
invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream);
......@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
unsigned int* h_rand_vals = new unsigned int[batch_size];
check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size));
generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size);
check_cuda_error(cudaMemcpyAsync(
h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(
cudaMemcpyAsync(h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(cudaStreamSynchronize(stream));
// The same seed produces the same random number.
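The elided assertion presumably groups slots by seed (i / period_size) and checks that each group drew the same value; a plausible shape for it (a sketch, not the diff body):

for (size_t i = 0; i < batch_size; ++i) {
    size_t group_leader = (i / period_size) * period_size;  // first slot sharing this seed
    EXPECT_EQ(h_rand_vals[i], h_rand_vals[group_leader]);
}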
......
#include <algorithm> // std::min, std::max
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // std::string
#include <vector> // std::vector
#include <algorithm> // std::min, std::max
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // std::string
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "gtest_utils.h"
......@@ -26,17 +27,24 @@ struct SamplingLayerTestParam {
size_t vocab_size;
size_t beam_width;
size_t top_k;
float top_p;
float top_p;
size_t output_len;
std::string toString() {
std::string toString()
{
return fmtstr("SamplingLayerTestParam[batch=%ld, vocab=%ld, beam=%ld, k=%ld, p=%3.1f, output_len=%ld]",
batch_size, vocab_size, beam_width, top_k, top_p, output_len);
batch_size,
vocab_size,
beam_width,
top_k,
top_p,
output_len);
}
};
template<typename T>
void computeProb(T* probs, T* logits, int batch_size, int vocab_size) {
void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
{
// Compute the probability from logits.
// logits = batch_size x vocab_size vector.
// probs = softmax(logits) (softmax along the vocab dimension)
......@@ -46,14 +54,15 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size) {
sum += expf((float)logits[bidx * vocab_size + i]);
}
for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i;
int idx = bidx * vocab_size + i;
probs[idx] = static_cast<T>(expf((float)logits[idx]) / (sum + EPSILON));
}
}
}
template<typename T>
void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
{
// Compute the log probability from logits.
// logits = batch_size x vocab_size vector.
// logprobs = log(softmax(logits)) (softmax along the vocab dimension)
......@@ -63,7 +72,7 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
sum += expf(logits[bidx * vocab_size + i]);
}
for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i;
int idx = bidx * vocab_size + i;
logprobs[idx] = static_cast<T>(logf(expf(logits[idx]) / (sum + EPSILON) + EPSILON));
}
}
......@@ -72,44 +81,45 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
template<typename T>
class SamplingDecodeTest: public testing::Test {
protected:
unsigned long long seed = 0;
const static unsigned long long max_seed = 30;
const size_t batch_size = 6;
const size_t beam_width = 1;
const size_t batchxbeam = batch_size * beam_width;
const size_t vocab_size = 8;
const size_t max_input_len = 0; // has no effect.
const size_t max_output_len = 3;
const size_t max_seq_len = max_input_len + max_output_len;
const int end_id = vocab_size - 1;
const DataType data_type = getTensorType<T>();
unsigned long long seed = 0;
const static unsigned long long max_seed = 30;
const size_t batch_size = 6;
const size_t beam_width = 1;
const size_t batchxbeam = batch_size * beam_width;
const size_t vocab_size = 8;
const size_t max_input_len = 0; // has no effect.
const size_t max_output_len = 3;
const size_t max_seq_len = max_input_len + max_output_len;
const int end_id = vocab_size - 1;
const DataType data_type = getTensorType<T>();
// vocab size 8 & length 3
T* test_input_logits;
cudaStream_t stream;
cudaStream_t stream;
ft::Allocator<ft::AllocatorType::CUDA>* allocator;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
std::mutex *cublas_wrapper_mutex;
cublasMMWrapper *cublas_wrapper;
DynamicDecodeLayer<T> *dynamic_decode_layer;
int* h_output_ids;
T* h_logits;
T* h_probs;
T* h_log_probs;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
std::mutex* cublas_wrapper_mutex;
cublasMMWrapper* cublas_wrapper;
DynamicDecodeLayer<T>* dynamic_decode_layer;
int* h_output_ids;
T* h_logits;
T* h_probs;
T* h_log_probs;
float* h_cum_log_probs;
float* h_output_log_probs;
T* d_logits;
int* d_input_lengths;
T* d_logits;
int* d_input_lengths;
float* d_cum_log_probs;
float* d_output_log_probs;
int* d_output_ids;
int* d_end_ids;
int* d_output_ids;
int* d_end_ids;
void setup(unsigned long long seed = 0) {
void setup(unsigned long long seed = 0)
{
this->seed = seed;
check_cuda_error(cudaStreamCreate(&stream));
......@@ -124,12 +134,8 @@ protected:
cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle,
cublaslt_handle,
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
allocator);
cublas_wrapper = new cublasMMWrapper(
cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, allocator);
dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size,
......@@ -140,26 +146,26 @@ protected:
false, // is_free_buffer_after_forward
&prop); // cuda_device_prop
h_output_ids = new int[batchxbeam];
h_logits = new T[batchxbeam * vocab_size];
h_probs = new T[batchxbeam * vocab_size];
h_log_probs = new T[batchxbeam * vocab_size];
h_cum_log_probs = new float[batchxbeam];
h_output_ids = new int[batchxbeam];
h_logits = new T[batchxbeam * vocab_size];
h_probs = new T[batchxbeam * vocab_size];
h_log_probs = new T[batchxbeam * vocab_size];
h_cum_log_probs = new float[batchxbeam];
h_output_log_probs = new float[max_output_len * batchxbeam];
// prob = (0.4, 0.3, 0.2, 0.1, ...)
test_input_logits = new T[24]{
-0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, // step 0
-FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, // step 1
-FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX // step 2
-0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, // step 0
-FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, // step 1
-FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX // step 2
};
d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size, true));
d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size, true));
d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
d_output_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batchxbeam));
d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam));
d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam));
d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
// Init by zero.
cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam);
......@@ -168,7 +174,8 @@ protected:
deviceFill(d_end_ids, batchxbeam, end_id, stream);
}
void teardown() {
void teardown()
{
delete[] test_input_logits;
delete[] h_output_ids;
delete[] h_logits;
......@@ -185,12 +192,8 @@ protected:
check_cuda_error(cudaStreamDestroy(stream));
}
TensorMap* createInputTensors(int* topk,
size_t topk_size,
float* topp,
size_t topp_size,
float* temperature,
float* repetition_penalty)
TensorMap* createInputTensors(
int* topk, size_t topk_size, float* topp, size_t topp_size, float* temperature, float* repetition_penalty)
{
// construct common input tensors
TensorMap* input_tensors = new TensorMap();
......@@ -206,16 +209,19 @@ protected:
if (repetition_penalty != nullptr) {
input_tensors->insert({"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty}});
}
input_tensors->insert({"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}});
input_tensors->insert(
{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}});
input_tensors->insert({"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}});
input_tensors->insert({"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}});
input_tensors->insert({"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}});
input_tensors->insert(
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}});
input_tensors->insert({"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batchxbeam}, &d_end_ids}});
input_tensors->insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, {1}, &seed}});
return input_tensors;
}
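For reference, each Tensor entry above wraps an existing buffer without copying; the same pattern reduced to a minimal sketch (helper name illustrative):

TensorMap* makeLogitsOnlyInputs(float* d_logits, size_t batch, size_t beam, size_t vocab)
{
    TensorMap* m = new TensorMap();
    // MEMORY_GPU + shape + raw pointer: the map stores a view, not a copy
    m->insert({"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch, beam, vocab}, d_logits}});
    return m;
}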
TensorMap* createOutputTensors() {
TensorMap* createOutputTensors()
{
// construct common output tensors
TensorMap* output_tensors = new TensorMap();
output_tensors->insert(
......@@ -225,26 +231,27 @@ protected:
{"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, d_cum_log_probs}});
output_tensors->insert(
{"output_log_probs",
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, d_output_log_probs}});
output_tensors->insert(
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, d_output_log_probs}});
output_tensors->insert({"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
return output_tensors;
}
void batchH2Dcpy(T* dst, T* src, size_t m, size_t n) {
void batchH2Dcpy(T* dst, T* src, size_t m, size_t n)
{
for (size_t i = 0; i < m; ++i) {
cudaH2Dcpy(dst + i * n, src, n);
}
}
bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids) {
bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids)
{
assert(expected_ids.size() == max_seq_len * batchxbeam);
int* h_output_ids = new int[max_seq_len * batchxbeam];
cudaD2Hcpy(h_output_ids, d_output_ids, max_seq_len * batchxbeam);
int failures = 0;
for (size_t i = 0; i < max_seq_len * batchxbeam; ++i) {
size_t s = i / batchxbeam;
size_t b = i % batchxbeam;
size_t s = i / batchxbeam;
size_t b = i % batchxbeam;
std::set<int> expts = expected_ids.at(i);
if (expts.count(h_output_ids[i]) == 0) {
if (failures < 10) {
......@@ -260,29 +267,29 @@ protected:
++failures;
}
}
TM_LOG_DEBUG("check...%6s : failures: %d / %d",
failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
TM_LOG_DEBUG(
"check...%6s : failures: %d / %d", failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
delete[] h_output_ids;
return failures == 0;
}
public:
void runTest(std::vector<std::set<int>> expected_output_ids,
int* top_ks,
size_t top_k_size,
float* top_ps,
size_t top_p_size,
float* temperature,
float* repetition_penalty,
bool use_local_batch = false)
int* top_ks,
size_t top_k_size,
float* top_ps,
size_t top_p_size,
float* temperature,
float* repetition_penalty,
bool use_local_batch = false)
{
size_t local_batch_size = use_local_batch ? batch_size / 3 : batch_size;
uint ite = use_local_batch ? 1 : 0;
uint ite = use_local_batch ? 1 : 0;
for (unsigned long long seed = 0; seed < max_seed; ++seed) {
this->setup(seed);
size_t step = max_input_len;
TensorMap* input_tensors = createInputTensors(
top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty);
size_t step = max_input_len;
TensorMap* input_tensors =
createInputTensors(top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty);
input_tensors->insert({"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}});
input_tensors->insert({"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}});
input_tensors->insert({"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}});
......@@ -316,27 +323,57 @@ TYPED_TEST_SUITE(SamplingDecodeTest, FloatAndHalfTypes);
TYPED_TEST(SamplingDecodeTest, TopK)
{
int top_k = 2;
std::vector<std::set<int>> expected_output_ids {
int top_k = 2;
std::vector<std::set<int>> expected_output_ids{
// batch
// 0 1 2 3 4 5
{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, // step 0
{4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, // step 1
{2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3} // step 2
{0, 1},
{0, 1},
{0, 1},
{0, 1},
{0, 1},
{0, 1}, // step 0
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5}, // step 1
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
}
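Why the expected entries are two-element sets: the hard-coded step-0 logits are log(0.4), log(0.3), log(0.2), log(0.1), so top-k = 2 keeps ids 0 and 1 and the sampler may legally emit either. A tiny sanity check of that reading (hypothetical helper):

#include <cassert>
#include <cmath>

void checkStepZeroLogitsAreLogProbs()
{
    assert(fabsf(logf(0.4f) - (-0.9163f)) < 1e-4f);
    assert(fabsf(logf(0.3f) - (-1.2040f)) < 1e-4f);
    assert(fabsf(logf(0.2f) - (-1.6094f)) < 1e-4f);
}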
TYPED_TEST(SamplingDecodeTest, BatchTopK)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 1, 2, 1, 1};
std::vector<std::set<int>> expected_output_ids {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 1, 2, 1, 1};
std::vector<std::set<int>> expected_output_ids{
// batch
// 0 1 2 3 4 5
{0, 1}, {0}, {0}, {0, 1}, {0}, {0}, // step 0
{4, 5}, {4}, {4}, {4, 5}, {4}, {4}, // step 1
{2, 3}, {2}, {2}, {2, 3}, {2}, {2} // step 2
{0, 1},
{0},
{0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2},
{2},
{2, 3},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks;
......@@ -344,52 +381,112 @@ TYPED_TEST(SamplingDecodeTest, BatchTopK)
TYPED_TEST(SamplingDecodeTest, TopP)
{
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids {
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
}
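With sorted probabilities (0.4, 0.3, 0.2, 0.1) and top_p = 0.3, the smallest prefix reaching the threshold is a single token, hence the singleton sets above; a sketch of that nucleus-size rule:

int nucleusSize(const float* sorted_probs, int n, float top_p)
{
    float cum = 0.0f;
    for (int i = 0; i < n; ++i) {
        cum += sorted_probs[i];
        if (cum >= top_p) {
            return i + 1;  // number of tokens kept in the nucleus
        }
    }
    return n;
}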
TYPED_TEST(SamplingDecodeTest, BatchTopP)
{
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids {
{0}, {0, 1}, {0, 1}, {0}, {0, 1}, {0, 1}, // step 0
{4}, {4, 5}, {4, 5}, {4}, {4, 5}, {4, 5}, // step 1
{2}, {2, 3}, {2, 3}, {2}, {2, 3}, {2, 3} // step 2
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids{
{0},
{0, 1},
{0, 1},
{0},
{0, 1},
{0, 1}, // step 0
{4},
{4, 5},
{4, 5},
{4},
{4, 5},
{4, 5}, // step 1
{2},
{2, 3},
{2, 3},
{2},
{2, 3},
{2, 3} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, TopKTopP) {
int top_k = 2;
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, TopKTopP)
{
int top_k = 2;
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
}
TYPED_TEST(SamplingDecodeTest, BatchTopKTopP)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 2, 2, 1};
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 2, 2, 1};
float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks;
......@@ -397,29 +494,59 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKTopP)
TYPED_TEST(SamplingDecodeTest, TopKBatchTopP)
{
size_t batch_size = this->batch_size;
int top_k = 2;
float* top_ps = new float[batch_size]{0.5, 0.3, 0.5, 0.5, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
size_t batch_size = this->batch_size;
int top_k = 2;
float* top_ps = new float[batch_size]{0.5, 0.3, 0.5, 0.5, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2
{0, 1},
{0},
{0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2
{0, 1},
{0},
{0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks;
......@@ -428,162 +555,351 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopK)
{
size_t batch_size = this->batch_size;
int top_k = 0;
std::vector<std::set<int>> expected_output_ids {
size_t batch_size = this->batch_size;
int top_k = 0;
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
}
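The singleton expectations in this and the sibling InvalidArgs tests suggest the layer treats k == 0 with p == 0 as "unset" and degrades to greedy decoding; a sketch of that assumed normalization:

void normalizeSamplingArgs(int& top_k, float& top_p)
{
    if (top_k == 0 && top_p == 0.0f) {
        top_k = 1;  // greedy: always emit the argmax token
    }
}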
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopP)
{
size_t batch_size = this->batch_size;
float top_p = 0;
std::vector<std::set<int>> expected_output_ids {
size_t batch_size = this->batch_size;
float top_p = 0;
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKTopP)
{
size_t batch_size = this->batch_size;
int top_k = 0;
float top_p = 0;
std::vector<std::set<int>> expected_output_ids {
size_t batch_size = this->batch_size;
int top_k = 0;
float top_p = 0;
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP) {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{0, 0, 0, 0, 0, 0};
float top_p = 0;
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{0, 0, 0, 0, 0, 0};
float top_p = 0;
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP) {
size_t batch_size = this->batch_size;
int top_k = 0;
float* top_ps = new float[batch_size]{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP)
{
size_t batch_size = this->batch_size;
int top_k = 0;
float* top_ps = new float[batch_size]{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0
{4}, {4}, {4}, {4}, {4}, {4}, // step 1
{2}, {2}, {2}, {2}, {2}, {2} // step 2
{0},
{0},
{0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero) {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 0, 0, 2, 1};
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 0, 0, 2, 1};
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0}, {0}, {0}, {0, 1}, {0}, // step 0
{4, 5}, {4}, {4}, {4}, {4, 5}, {4}, // step 1
{2, 3}, {2}, {2}, {2}, {2, 3}, {2} // step 2
{0, 1},
{0},
{0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2},
{2},
{2},
{2, 3},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero) {
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.5f, 0.5f, 0.0f, 0.5f, 0.0f, 0.3f};
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero)
{
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.5f, 0.5f, 0.0f, 0.5f, 0.0f, 0.3f};
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0, 1}, {0}, {0, 1}, {0}, {0}, // step 0
{4, 5}, {4, 5}, {4}, {4, 5}, {4}, {4}, // step 1
{2, 3}, {2, 3}, {2}, {2, 3}, {2}, {2} // step 2
{0, 1},
{0, 1},
{0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2, 3},
{2},
{2} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero) {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 0, 2, 0};
float top_p = 0.0;
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 0, 2, 0};
float top_p = 0.0;
std::vector<std::set<int>> expected_output_ids{
// batch
{0, 1}, {0, 1}, {0}, {0}, {0, 1}, {0}, // step 0
{4, 5}, {4, 5}, {4}, {4}, {4, 5}, {4}, // step 1
{2, 3}, {2, 3}, {2}, {2}, {2, 3}, {2} // step 2
{0, 1},
{0, 1},
{0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2},
{2, 3},
{2} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero) {
size_t batch_size = this->batch_size;
int top_k = 0;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero)
{
size_t batch_size = this->batch_size;
int top_k = 0;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0, 1}, {0}, {0}, {0, 1}, // step 0
{4}, {4}, {4, 5}, {4}, {4}, {4, 5}, // step 1
{2}, {2}, {2, 3}, {2}, {2}, {2, 3} // step 2
{0},
{0},
{0, 1},
{0},
{0},
{0, 1}, // step 0
{4},
{4},
{4, 5},
{4},
{4},
{4, 5}, // step 1
{2},
{2},
{2, 3},
{2},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero) {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{0, 2, 1, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.9, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{0, 2, 1, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.9, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0}, {0, 1}, {0}, {0, 1}, // step 0
{4}, {4}, {4}, {4, 5}, {4}, {4, 5}, // step 1
{2}, {2}, {2}, {2, 3}, {2}, {2, 3} // step 2
{0},
{0},
{0},
{0, 1},
{0},
{0, 1}, // step 0
{4},
{4},
{4},
{4, 5},
{4},
{4, 5}, // step 1
{2},
{2},
{2},
{2, 3},
{2},
{2, 3} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks;
delete[] top_ps;
}
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP) {
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids {
{0}, {0}, {0, 1}, {0}, {0}, {0}, // step 0
{0}, {0}, {4, 5}, {4}, {0}, {0}, // step 1
{0}, {0}, {2, 3}, {2}, {0}, {0} // step 2
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP)
{
size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids{
{0},
{0},
{0, 1},
{0},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2},
{0},
{0} // step 2
};
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ps;
}
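In the local-batch variants, runTest passes local_batch_size = batch_size / 3 and ite = 1, so only slots [2, 4) are decoded; the other columns keep their zero-initialized output ids, which is why they expect {0}. The slice arithmetic, spelled out:

size_t local_batch_size = 6 / 3;                           // == 2
size_t local_begin      = 1 /*ite*/ * local_batch_size;    // == 2
size_t local_end        = local_begin + local_batch_size;  // == 4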
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP) {
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids {
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP)
{
size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch
{0}, {0}, {0, 1}, {0, 1}, {0}, {0}, // step 0
{0}, {0}, {4, 5}, {4, 5}, {0}, {0}, // step 1
{0}, {0}, {2, 3}, {2, 3}, {0}, {0} // step 2
{0},
{0},
{0, 1},
{0, 1},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4, 5},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2, 3},
{0},
{0} // step 2
};
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ks;
......@@ -601,15 +917,10 @@ public:
check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream));
cublas_algo_map = new cublasAlgoMap("");
cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle,
cublaslt_handle,
stream,
cublas_algo_map,
cublas_wrapper_mutex,
allocator);
cublas_wrapper = new cublasMMWrapper(
cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, allocator);
}
void TearDown() override
{
......@@ -626,12 +937,11 @@ protected:
using FtTestBase::allocator;
struct cudaDeviceProp prop;
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
cublasAlgoMap* cublas_algo_map;
std::mutex* cublas_wrapper_mutex;
cublasMMWrapper* cublas_wrapper;
DataType data_type = getTensorType<T>();
......@@ -643,50 +953,50 @@ protected:
size_t max_output_len;
size_t max_seq_len;
uint top_k;
float top_p;
float temperature;
float repetition_penalty;
int end_id;
T* h_logits;
T* h_probs;
T* h_log_probs;
float* h_cum_log_probs;
float* h_output_log_probs;
int* h_output_ids;
T* d_logits;
int* d_input_lengths;
float* d_cum_log_probs;
float* d_output_log_probs;
int* d_output_ids;
int* d_end_ids;
void setup(SamplingLayerTestParam param)
{
batch_size = param.batch_size;
beam_width = param.beam_width;
batchxbeam = batch_size * param.beam_width;
vocab_size = param.vocab_size;
max_input_len = 0;
max_output_len = param.output_len;
max_seq_len = max_input_len + max_output_len;
top_k = param.top_k;
top_p = param.top_p;
// Use default values that have no effect.
temperature = 1.0f;
repetition_penalty = 1.0f;
end_id = 0;
h_logits = new T[batchxbeam * vocab_size];
h_output_ids = new int[batchxbeam];
d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size));
d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam));
d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
// Init to zero.
deviceFill(d_input_lengths, batchxbeam, 0, stream);
......@@ -694,14 +1004,13 @@ protected:
deviceFill(d_end_ids, batch_size, end_id);
}
void teardown() {
void teardown()
{
delete[] h_logits;
delete[] h_output_ids;
}
void runCurandTest(SamplingLayerTestParam param,
bool use_local_batch,
bool use_single_random_seed)
void runCurandTest(SamplingLayerTestParam param, bool use_local_batch, bool use_single_random_seed)
{
setup(param);
const DataType data_type = getTensorType<T>();
......@@ -709,7 +1018,7 @@ protected:
const size_t local_batch_size = use_local_batch ? 3 : batch_size;
assert(batch_size % local_batch_size == 0);
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size,
end_id,
stream,
......@@ -719,9 +1028,9 @@ protected:
&prop); // cuda_device_prop
// Prepare decoding arguments
const size_t random_seed_size = use_single_random_seed ? 1 : batch_size;
const size_t period_size = 3;
unsigned long long* random_seed = new unsigned long long[random_seed_size];
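// Seeds repeat with period period_size: every period_size consecutive sequences share a seed
// and are expected to sample identical ids (verified in the check loop below).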
for (size_t i = 0; i < random_seed_size; ++i) {
random_seed[i] = i / period_size;
}
......@@ -739,29 +1048,27 @@ protected:
cudaH2Dcpy(d_logits, h_logits, batchxbeam * vocab_size);
for (uint ite = 0; ite < iteration_num; ++ite) {
TensorMap dynamic_decode_input_tensors({
{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}},
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_end_ids}},
{"random_seed", {MEMORY_CPU, TYPE_UINT64, {random_seed_size}, random_seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}}
});
TensorMap dynamic_decode_input_tensors(
{{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}},
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_end_ids}},
{"random_seed", {MEMORY_CPU, TYPE_UINT64, {random_seed_size}, random_seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}}});
// common outputs
TensorMap dynamic_decode_output_tensors({
{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}},
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}
});
dynamic_decode_layer->forward(&dynamic_decode_output_tensors,
&dynamic_decode_input_tensors);
TensorMap dynamic_decode_output_tensors(
{{"output_ids",
Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}},
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
sync_check_cuda_error();
// check results.
......@@ -774,7 +1081,11 @@ protected:
for (size_t j = 1; j < period_size; ++j) {
EXPECT_TRUE(h_output_ids[i] == h_output_ids[i + j])
<< fmtstr("Fail at step %u val[%d]=%d <> val[%d]=%d",
step, i, h_output_ids[i], i + j, h_output_ids[i + j]);
step,
i,
h_output_ids[i],
i + j,
h_output_ids[i + j]);
}
}
}
......@@ -783,11 +1094,12 @@ protected:
teardown();
}
void runCumLogProbTest(SamplingLayerTestParam param) {
void runCumLogProbTest(SamplingLayerTestParam param)
{
setup(param);
unsigned long long seed = 43;
const DataType data_type = getTensorType<T>();
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
unsigned long long seed = 43;
const DataType data_type = getTensorType<T>();
DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size,
end_id,
stream,
......@@ -798,10 +1110,10 @@ protected:
// Logit values on the host, of shape ((batch_size x beam) x vocab_size) where beam = 1.
// T* h_logits = new T[batch_size * beam_width * vocab_size];
T* h_probs = new T[batch_size * beam_width * vocab_size];
T* h_log_probs = new T[batch_size * beam_width * vocab_size];
float* h_cum_log_probs = new float[batch_size * beam_width];
float* h_output_log_probs = new float[max_output_len * batch_size * beam_width];
float* expected_cum_log_probs = new float[batch_size * beam_width];
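// Host-side reference data: random test logits plus the probabilities and log-probs derived from them.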
initRandom(h_logits, batch_size * beam_width * vocab_size, -3.0f, 3.0f);
computeProb(h_probs, h_logits, batch_size * beam_width, vocab_size);
......@@ -810,10 +1122,11 @@ protected:
int* tiled_input_lengths_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * beam_width));
float* cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size * beam_width));
float* output_log_probs = reinterpret_cast<float*>(
allocator->malloc(sizeof(float) * max_output_len * batch_size * beam_width));
float* output_log_probs =
reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batch_size * beam_width));
int* output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batch_size * beam_width));
int* output_ids =
reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batch_size * beam_width));
int* h_output_ids = new int[batch_size * beam_width];
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
......@@ -824,65 +1137,64 @@ protected:
cudaMemset(output_log_probs, 0, sizeof(float) * max_output_len * batch_size * beam_width);
cudaMemset(output_ids, 0, sizeof(int) * max_seq_len * batch_size * beam_width);
TensorMap input_tensors({
{"random_seed", {MEMORY_CPU, TYPE_INT32, {1}, &seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
{"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}
});
TensorMap input_tensors({{"random_seed", {MEMORY_CPU, TYPE_INT32, {1}, &seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
{"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});
dynamic_decode_layer->setup(batch_size, beam_width, &input_tensors);
for (size_t step = max_input_len; step < max_output_len; ++step) {
uint ite = 0;
// Reset to the test values since the sampling layer internally updates the logit buffer (making it log-prob).
cudaH2Dcpy(d_logits, h_logits, batch_size * beam_width * vocab_size);
TensorMap dynamic_decode_input_tensors({
{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
{"input_lengths",
Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &batch_size}},
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids}},
{"random_seed", {MEMORY_CPU, TYPE_UINT64, {1}, &seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
{"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}
});
TensorMap dynamic_decode_input_tensors(
{{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &batch_size}},
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids}},
{"random_seed", {MEMORY_CPU, TYPE_UINT64, {1}, &seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
{"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
{"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});
// common outputs
TensorMap dynamic_decode_output_tensors({
{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids}},
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
{"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, cum_log_probs}},
{"output_log_probs",
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, output_log_probs}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});
TensorMap dynamic_decode_output_tensors(
{{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids}},
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
{"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, cum_log_probs}},
{"output_log_probs",
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, output_log_probs}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});
dynamic_decode_layer->forward(&dynamic_decode_output_tensors,
&dynamic_decode_input_tensors);
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
TM_LOG_DEBUG("Step %2d generated ids", step);
cudaD2Hcpy(h_output_ids,
dynamic_decode_output_tensors
.at("output_ids")
.getPtrWithOffset<int>(step * (batch_size * beam_width)),
batch_size * beam_width);
cudaD2Hcpy(
h_output_ids,
dynamic_decode_output_tensors.at("output_ids").getPtrWithOffset<int>(step * (batch_size * beam_width)),
batch_size * beam_width);
cudaD2Hcpy(h_cum_log_probs, cum_log_probs, batch_size * beam_width);
cudaD2Hcpy(h_output_log_probs, output_log_probs, max_output_len * batch_size * beam_width);
for (size_t i = 0; i < batch_size * beam_width; ++i) {
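// Accumulate the reference value: the cumulative log-prob is the running sum of the
// log-probs of the ids sampled so far.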
int idx = i * vocab_size + h_output_ids[i];
expected_cum_log_probs[i] += (float)h_log_probs[idx];
TM_LOG_DEBUG(
"| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
"| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
(int)step, (int)i, (int)idx, (int)h_output_ids[i],
h_output_log_probs[step * batch_size * beam_width + i], (float)h_log_probs[idx],
h_cum_log_probs[i], expected_cum_log_probs[i], (float)h_probs[idx]);
TM_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
"| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
(int)step,
(int)i,
(int)idx,
(int)h_output_ids[i],
h_output_log_probs[step * batch_size * beam_width + i],
(float)h_log_probs[idx],
h_cum_log_probs[i],
expected_cum_log_probs[i],
(float)h_probs[idx]);
}
TM_LOG_DEBUG("");
}
......@@ -898,7 +1210,6 @@ protected:
delete dynamic_decode_layer;
}
};
TYPED_TEST_SUITE(SamplingDecodeTest2, FloatAndHalfTypes);
......
#include <iostream>
#include <vector>
#include <unordered_map>
#include <vector>
#include <gtest/gtest.h>
......@@ -10,16 +10,17 @@ using namespace turbomind;
namespace {
#define EXPECT_EQUAL_TENSORS(t1, t2) \
do { \
EXPECT_TRUE(t1.where == t2.where); \
EXPECT_TRUE(t1.type == t2.type); \
EXPECT_TRUE(t1.shape == t2.shape); \
EXPECT_TRUE(t1.data == t2.data); \
} while(false)
TEST(TensorMapTest, HasKeyCorrectness) {
bool* v1 = new bool(true);
#define EXPECT_EQUAL_TENSORS(t1, t2) \
do { \
EXPECT_TRUE(t1.where == t2.where); \
EXPECT_TRUE(t1.type == t2.type); \
EXPECT_TRUE(t1.shape == t2.shape); \
EXPECT_TRUE(t1.data == t2.data); \
} while (false)
TEST(TensorMapTest, HasKeyCorrectness)
{
bool* v1 = new bool(true);
float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2};
......@@ -33,8 +34,9 @@ TEST(TensorMapTest, HasKeyCorrectness) {
delete[] v2;
}
TEST(TensorMapTest, InsertCorrectness) {
int* v1 = new int[4]{1, 10, 20, 30};
TEST(TensorMapTest, InsertCorrectness)
{
int* v1 = new int[4]{1, 10, 20, 30};
float* v2 = new float[2]{1.0f, 2.0f};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2);
......@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) {
EXPECT_FALSE(map.isExist("t2"));
}
TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
TEST(TensorMapTest, InsertDoesNotAllowNoneTensor)
{
TensorMap map;
EXPECT_TRUE(map.size() == 0);
// forbid a none tensor.
......@@ -57,10 +60,11 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error);
}
TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey)
{
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
TensorMap map({{"t1", t1}});
EXPECT_TRUE(map.size() == 1);
// forbid a duplicated key.
......@@ -68,8 +72,9 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
delete[] v1;
}
TEST(TensorMapTest, GetValCorrectness) {
int* v1 = new int[4]{1, 10, 20, 30};
TEST(TensorMapTest, GetValCorrectness)
{
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
TensorMap map({{"t1", t1}});
......@@ -93,13 +98,14 @@ TEST(TensorMapTest, GetValCorrectness) {
delete[] v1;
}
TEST(TensorMapTest, GetTensorCorrectness) {
bool* t1_val = new bool(true);
TEST(TensorMapTest, GetTensorCorrectness)
{
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
int* default_val = new int[4]{0, 1, 2, 3};
Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
TensorMap map({{"t1", t1}, {"t2", t2}});
......@@ -114,13 +120,14 @@ TEST(TensorMapTest, GetTensorCorrectness) {
delete[] t1_val;
}
TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
bool* t1_val = new bool(true);
TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap)
{
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
int* default_val = new int[4]{0, 1, 2, 3};
Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
const TensorMap map({{"t1", t1}, {"t2", t2}});
......@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
delete[] t1_val;
}
TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
TEST(TensorTest, EmptyTensorMinMaxRaiseError)
{
Tensor t1;
EXPECT_THROW(t1.min<int>(), std::runtime_error);
EXPECT_THROW(t1.max<int>(), std::runtime_error);
......@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
EXPECT_THROW(t2.max<int>(), std::runtime_error);
}
using TensorTypes = testing::Types<int8_t, int, float>;
template <typename T>
class TensorFuncTest : public testing::Test {};
template<typename T>
class TensorFuncTest: public testing::Test {};
TYPED_TEST_SUITE(TensorFuncTest, TensorTypes);
TYPED_TEST(TensorFuncTest, MaxCorrectness) {
TYPED_TEST(TensorFuncTest, MaxCorrectness)
{
using T = TypeParam;
size_t size = 4;
T* v1 = new T[size] {T(1), T(2), T(3), T(4)};
T* v2 = new T[size] {T(4), T(3), T(2), T(1)};
T* v3 = new T[size] {T(1), T(2), T(4), T(3)};
T* v1 = new T[size]{T(1), T(2), T(3), T(4)};
T* v2 = new T[size]{T(4), T(3), T(2), T(1)};
T* v3 = new T[size]{T(1), T(2), T(4), T(3)};
Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1);
Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2);
......@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) {
delete[] v3;
}
TYPED_TEST(TensorFuncTest, MinCorrectness) {
TYPED_TEST(TensorFuncTest, MinCorrectness)
{
using T = TypeParam;
size_t size = 4;
......@@ -197,42 +206,45 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) {
delete[] v3;
}
TYPED_TEST(TensorFuncTest, AnyCorrectness) {
TYPED_TEST(TensorFuncTest, AnyCorrectness)
{
using T = TypeParam;
T* v = new T[4]{T(1), T(2), T(3), T(4)};
Tensor t = Tensor{MEMORY_CPU, getTensorType<T>(), {4}, v};
EXPECT_TRUE(t.any<T>(T(1)));
EXPECT_FALSE(t.any<T>(T(5)));
delete[] v;
}
TYPED_TEST(TensorFuncTest, AllCorrectness) {
TYPED_TEST(TensorFuncTest, AllCorrectness)
{
using T = TypeParam;
constexpr size_t size = 4;
T* v1 = new T[size]{T(1), T(1), T(1), T(1)};
T* v2 = new T[size]{T(1), T(1), T(1), T(2)};
Tensor t1 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v1};
Tensor t2 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v2};
EXPECT_TRUE(t1.all<T>(T(1)));
EXPECT_FALSE(t2.all<T>(T(2)));
delete[] v1;
delete[] v2;
}
TYPED_TEST(TensorFuncTest, SliceCorrectness) {
TYPED_TEST(TensorFuncTest, SliceCorrectness)
{
using T = TypeParam;
constexpr int size = 12;
T* v = new T[size];
for (int i = 0; i < size; ++i) {
v[i] = i;
}
DataType dtype = getTensorType<T>();
Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
Tensor t2 = t1.slice({2, 4}, 4);
EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4]));
// A slice that overflows the tensor throws an exception.
......@@ -241,4 +253,4 @@ TYPED_TEST(TensorFuncTest, SliceCorrectness) {
delete[] v;
}
} // end of namespace
......@@ -16,15 +16,15 @@
#pragma once
#include <algorithm> // min, max
#include <assert.h> // assert
#include <float.h> // FLT_MAX
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <limits> // numeric_limits
#include <stdlib.h> // rand
#include <string> // string
#include <vector> // vector
#include <algorithm> // min, max
#include <assert.h> // assert
#include <float.h> // FLT_MAX
#include <iostream> // snprintf
#include <limits> // numeric_limits
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // string
#include <vector> // vector
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
......@@ -36,32 +36,37 @@
using namespace turbomind;
class TestFailureError : public std::exception {
class TestFailureError: public std::exception {
private:
std::string msg_;
public:
explicit TestFailureError() = default;
explicit TestFailureError(std::string name, std::string msg = "") {
explicit TestFailureError(std::string name, std::string msg = "")
{
msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
}
const char* what () const throw () {
const char* what() const throw()
{
return msg_.c_str();
}
};
#define EXPECT_TRUE(cond) \
do { if(!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
#define EXPECT_FALSE(cond) \
do { if(cond) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
#define EXPECT_TRUE(cond) \
do { \
if (!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while (false)
#define EXPECT_FALSE(cond) \
do { \
if (cond) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while (false)
bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
{
......@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
}
template<typename T>
bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float rtol) {
size_t failures = 0;
float relative_gap = 0.0f;;
bool checkResult(std::string name, T* out, T* ref, size_t size, float atol, float rtol)
{
size_t failures = 0;
float relative_gap = 0.0f;
;
for (size_t i = 0; i < size; ++i) {
// The values for the output and the reference.
......@@ -109,18 +116,21 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
// Allow up to 1% of elements to mismatch.
size_t tol_failures = (size_t)(0.01 * size);
TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
failures <= tol_failures ? "....OK" : "FAILED", name.c_str(),
100. * failures / size, atol, rtol, 100. * relative_gap);
failures <= tol_failures ? "....OK" : "FAILED",
name.c_str(),
100. * failures / size,
atol,
rtol,
100. * relative_gap);
return failures <= tol_failures;
}
template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size,
bool device_out = true, bool device_ref = false)
bool checkResult(std::string name, T* out, T* ref, size_t size, bool device_out = true, bool device_ref = false)
{
bool is_fp32 = sizeof(T) == 4;
float atol = is_fp32 ? 1e-4f : 1e-3f;
float rtol = is_fp32 ? 1e-2f : 1e-1f;
T* h_out = nullptr;
if (device_out) {
......@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
ref = h_ref;
}
bool is_ok = checkResult(name, out, ref, size, atol, rtol);
if (h_out != nullptr){
if (h_out != nullptr) {
delete[] h_out;
}
if (h_ref != nullptr) {
......@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
}
template<typename T>
void initRandom(T* ptr, size_t size, float minval, float maxval) {
void initRandom(T* ptr, size_t size, float minval, float maxval)
{
for (size_t i = 0; i < size; ++i) {
float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
val *= (maxval - minval);
......@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) {
}
}
void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
void initRandomInt(int* ptr, size_t size, int minval, int maxval)
{
assert(minval < maxval);
int mod = maxval - minval;
for (size_t i = 0; i < size; ++i) {
......@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
}
template<typename T>
void tile(T* x, int m, int n) {
void tile(T* x, int m, int n)
{
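// Broadcast the first row of the m x n matrix to the remaining m - 1 rows in place.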
for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) {
x[i * n + j] = x[j];
......@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) {
}
template<typename T>
void tile(T* dst, T* src, int m, int n) {
void tile(T* dst, T* src, int m, int n)
{
for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) {
dst[i * n + j] = src[j];
......@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) {
#define HALF_FLT_MAX 65504.0f
template<typename T>
bool isHalf() {
bool isHalf()
{
return std::is_same<T, half>::value;
}
template<typename T>
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr) {
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr);
}