Unverified commit 4c9959f6 authored by Chen Xin, committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
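
Several of the commits listed above ("remove gettimeofday", "use cudaStreamSynchronize instead of volatile check") replace POSIX-only constructs with portable equivalents so the Windows build can work. A minimal sketch of that pattern, assuming a CUDA stream and a callable that launches the kernel under test; the helper name and signature below are illustrative and not taken from this diff:

```cpp
#include <chrono>
#include <cuda_runtime.h>

// Hypothetical timing helper: std::chrono::steady_clock is available on both
// Linux and MSVC, unlike gettimeofday(); cudaStreamSynchronize() replaces
// spinning on a volatile host flag to wait for device work.
template<typename LaunchFn>
double average_launch_ms(LaunchFn&& launch, int iters, cudaStream_t stream)
{
    cudaStreamSynchronize(stream);  // drain any pending work before timing
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i) {
        launch(stream);
    }
    cudaStreamSynchronize(stream);  // wait for the timed launches to finish
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(end - start).count() / iters;
}
```

The int8 gemm test in this diff follows the same structure, timing its loop with high_resolution_clock between two cudaStreamSynchronize calls.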
......@@ -24,6 +24,12 @@
namespace turbomind {
// cub.cuh brings windows.h
// should be included after cub.cuh
#ifdef ERROR
#undef ERROR
#endif
class Logger {
public:
......
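
The `#undef ERROR` guard in this hunk is needed because windows.h (dragged in by cub.cuh on Windows) defines `ERROR` as an object-like macro in wingdi.h, which would otherwise corrupt any identifier named ERROR declared afterwards, such as a log-level enumerator. A minimal sketch of the collision, using a hypothetical level enum rather than the real Logger internals:

```cpp
// wingdi.h (included via windows.h) effectively does: #define ERROR 0
// Any later use of the identifier ERROR is rewritten to 0 by the preprocessor.
#ifdef ERROR
#undef ERROR  // same guard as in logger.h: neutralize the macro first
#endif

enum class Level {  // hypothetical stand-in for the logger's severity levels
    TRACE,
    DEBUG,
    INFO,
    WARNING,
    ERROR  // compiles only because the macro was #undef'd above
};
```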
......@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/logger.h"
......@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
}
// get slices
ConcateSlice slice0{.slices = {{0, dim0}}};
ConcateSlice slice1{.slices = {{0, dim1}}};
ConcateSlice slice0{{{0, dim0}}};
ConcateSlice slice1{{{0, dim1}}};
if (slices.size() > 0 && slices[0].slices.size() > 0) {
slice0 = slices[0];
}
......
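
The two rewritten ConcateSlice lines above (and the similar Layout/Params rewrites in the flash-attention test further down) drop designated initializers such as `.slices = {{0, dim0}}`. Designated initializers are a C++20 feature (previously only a GCC/Clang extension), and MSVC rejects them under the C++14/17 settings this project builds with, so positional aggregate initialization is the portable spelling. A minimal sketch, assuming ConcateSlice is an aggregate whose first member is a vector of (start, end) pairs; the real definition is not shown in this diff:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Assumed shape of the aggregate; the actual definition lives elsewhere in the repo.
struct ConcateSlice {
    std::vector<std::pair<std::size_t, std::size_t>> slices;
};

int main()
{
    std::size_t dim0 = 128;

    // C++20 / GNU-extension form that MSVC rejects at this language level:
    // ConcateSlice slice0{.slices = {{0, dim0}}};

    // Portable positional aggregate initialization: outer braces for the struct,
    // middle braces for the vector, inner braces for the single (start, end) pair.
    ConcateSlice slice0{{{0, dim0}}};
    return slice0.slices.size() == 1 ? 0 : 1;
}
```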
......@@ -15,6 +15,7 @@
*/
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/macro.h"
#include <atomic>
namespace turbomind {
......
......@@ -18,7 +18,7 @@
#include "nvtx_utils.h"
#ifdef USE_NVTX
#include "nvToolsExt.h"
#include "nvtx3/nvToolsExt.h"
#endif
namespace ft_nvtx {
......
......@@ -49,12 +49,12 @@ Tensor fused_gemm_dq_helper(
const T* scales_ptr = get_ptr<const T>(scales);
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
T* output_tensor_ptr = get_ptr<T>(output_tensor);
T* output_tensor_ptr = get_ptr<T>(output_tensor);
char* ws_ptr = get_ptr<char>(ws_tensor);
cudaEvent_t start, stop;
......@@ -258,12 +258,12 @@ Tensor fused_gemm_dq_bias_act_helper(
const T* bias_ptr = get_ptr<const T>(bias);
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
T* output_tensor_ptr = get_ptr<T>(output_tensor);
T* output_tensor_ptr = get_ptr<T>(output_tensor);
char* ws_ptr = get_ptr<char>(ws_tensor);
fused_gemm_dq_runner.gemm_bias_act(input_act_ptr,
......
......@@ -14,11 +14,11 @@
* limitations under the License.
*/
#include <chrono>
#include <cstdlib>
#include <cublas_v2.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <chrono>
#include "torch/csrc/cuda/Stream.h"
#include <torch/custom_class.h>
......@@ -37,18 +37,17 @@ using torch_ext::get_ptr;
namespace ft = turbomind;
template<typename T>
void int8_gemm_test(
const int m,
const int n,
const int k,
const at::ScalarType output_data_type,
const QuantMode quant_mode,
const int iters)
void int8_gemm_test(const int m,
const int n,
const int k,
const at::ScalarType output_data_type,
const QuantMode quant_mode,
const int iters)
{
const bool per_token_quant = quant_mode == QuantMode::PerTokenChannelQuant
|| quant_mode == QuantMode::PerTokenQuant;
const bool per_channel_quant = quant_mode == QuantMode::PerTokenChannelQuant
|| quant_mode == QuantMode::PerChannelQuant;
const bool per_token_quant =
quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant;
const bool per_channel_quant =
quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant;
const int row_scale_size = per_token_quant ? m : 1;
const int col_scale_size = per_channel_quant ? n : 1;
......@@ -76,16 +75,16 @@ void int8_gemm_test(
ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr<int32_t>(w)}.saveNpy("w.npy");
ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y)}.saveNpy("y.npy");
auto x_gpu = x.to(at_int8).to(torch::kCUDA);
auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
auto w_gpu = w.to(at_int8).to(torch::kCUDA);
auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
auto x_gpu = x.to(at_int8).to(torch::kCUDA);
auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
auto w_gpu = w.to(at_int8).to(torch::kCUDA);
auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false));
auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
* torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
* torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
auto alpha_row_torch = alpha_row_cultass.expand({m, 1});
auto alpha_col_torch = alpha_col_cutlass.expand({1, n});
......@@ -101,40 +100,41 @@ void int8_gemm_test(
auto stream = at::cuda::getCurrentCUDAStream().stream();
// warm_up
cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)m, (size_t)k}, get_ptr<int8_t>(x_gpu)}.saveNpy("x_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy("y_gpu_int32.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy(
"y_gpu_int32.npy");
ft::check_cuda_error(cudaStreamSynchronize(stream));
auto start = high_resolution_clock::now();
for (int i = 0; i < iters; ++i) {
cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
get_ptr<int8_t>(w_T_gpu),
quant_mode,
get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu),
m,
n,
k,
nullptr,
0,
stream);
}
ft::check_cuda_error(cudaStreamSynchronize(stream));
......@@ -142,27 +142,30 @@ void int8_gemm_test(
auto duration = duration_cast<microseconds>(end - start);
if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
if (torch::allclose(
(y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
} else {
}
else {
TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) *
// alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
}
}
int main(int argc, char **argv)
int main(int argc, char** argv)
{
if (argc != 7) {
TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
TM_LOG_ERROR(
"arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
return 0;
}
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int k = atoi(argv[3]);
const at::ScalarType output_data_type = atoi(argv[4]) == 0 ?
at::ScalarType::Half : at::ScalarType::BFloat16;
const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
const int m = atoi(argv[1]);
const int n = atoi(argv[2]);
const int k = atoi(argv[3]);
const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? at::ScalarType::Half : at::ScalarType::BFloat16;
const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
if (quant_mode == QuantMode::PerChannelQuant) {
printf("per channel quant \n");
}
......@@ -170,7 +173,8 @@ int main(int argc, char **argv)
if (output_data_type == at::ScalarType::Half) {
int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
} else {
}
else {
#if ENABLE_BF16
int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
#endif
......
......@@ -20,7 +20,12 @@ FetchContent_Declare(
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG release-1.12.1
)
add_definitions(-DTORCH_CUDA=1)
find_package(CUDAToolkit REQUIRED)
if (NOT MSVC)
add_definitions(-DTORCH_CUDA=1)
endif()
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
......@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14)
# Sorted by alphabetical order of test name.
target_link_libraries( # Libs for test_attention_kernels
unittest PUBLIC
-lcudart -lcurand
CUDA::cudart CUDA::curand
gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
target_link_libraries( # Libs for test_logprob_kernels
unittest PUBLIC
-lcudart
CUDA::cudart
logprob_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_penalty_kernels
unittest PUBLIC
-lcublas -lcublasLt -lcudart
CUDA::cublas CUDA::cublasLt CUDA::cudart
sampling_penalty_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_sampling_kernel
unittest PUBLIC
-lcudart
CUDA::cudart
sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger)
target_link_libraries( # Libs for test_sampling_layer
unittest PUBLIC
-lcublas -lcublasLt -lcudart
CUDA::cublas CUDA::cublasLt CUDA::cudart
cublasMMWrapper memory_utils
DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger)
target_link_libraries( # Libs for test_tensor
......@@ -65,7 +70,7 @@ target_link_libraries( # Libs for test_tensor
remove_definitions(-DTORCH_CUDA=1)
add_executable(test_gemm test_gemm.cu)
target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper tensor cuda_utils logger)
target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
add_executable(test_gpt_kernels test_gpt_kernels.cu)
target_link_libraries(test_gpt_kernels PUBLIC
......@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC
add_executable(test_context_attention_layer test_context_attention_layer.cu)
target_link_libraries(test_context_attention_layer PUBLIC
Llama -lcublas -lcublasLt -lcudart
Llama CUDA::cublas CUDA::cublasLt CUDA::cudart
unfused_attention_kernels
memory_utils tensor cublasMMWrapper cuda_utils logger)
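
The unittest CMake changes above replace raw linker flags such as -lcudart with the imported targets (CUDA::cudart, CUDA::cublas, CUDA::cublasLt, CUDA::curand) exported by find_package(CUDAToolkit). The -l spelling is a GNU-ld convention that MSVC's linker does not understand, whereas the imported targets resolve to the right library names, include directories, and link options on every platform CMake supports (FindCUDAToolkit requires CMake 3.17 or newer).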
......@@ -14,13 +14,12 @@
* limitations under the License.
*/
#include "gtest_utils.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "gtest_utils.h"
#include <curand.h>
#include <sstream>
......
......@@ -336,35 +336,26 @@ int main(int argc, const char* argv[])
// compute actual
using AttentionOp = FlashAttentionOp<scalar_t>;
using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = num_heads * seq_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = seq_len * size_per_head};
Layout layout_k{.stride_batch = num_heads * key_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_v{.stride_batch = num_heads * key_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_o{.stride_batch = num_heads * seq_len * size_per_head,
.stride_seq = num_heads * size_per_head,
.stride_head = size_per_head,
.use_seqlens = true};
Layout layout_q{num_heads * seq_len * size_per_head, size_per_head, seq_len * size_per_head};
Layout layout_k{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
Layout layout_v{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
Layout layout_o{num_heads * seq_len * size_per_head, num_heads * size_per_head, size_per_head, true};
AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head);
float* accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true);
typename AttentionOp::Params attn_params{.attn_out = actual_out_ptr,
.query = query_ptr,
.key = key_ptr,
.val = val_ptr,
.mask = mask_ptr,
.out_accum = accum_buf_ptr,
.cu_seqlens_q = cu_seqlens_ptr,
.cu_seqlens_k = nullptr,
.group_size = 1,
.layout_q = layout_q,
.layout_k = layout_k,
.layout_v = layout_v,
.layout_o = layout_o};
typename AttentionOp::Params attn_params{actual_out_ptr,
query_ptr,
key_ptr,
val_ptr,
mask_ptr,
accum_buf_ptr,
cu_seqlens_ptr,
nullptr,
1,
layout_q,
layout_k,
layout_v,
layout_o};
flash_attention(attn_params, stream);
sync_check_cuda_error();
......
This diff is collapsed.
......@@ -5,10 +5,10 @@
#include <string>
#include <vector>
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include <algorithm>
#include <iostream>
......@@ -39,13 +39,14 @@ protected:
void testTransposition();
};
void fill_tensor_random(Tensor a) {
const size_t num_elems = a.size();
std::vector<int8_t> host_values(num_elems);
void fill_tensor_random(Tensor a)
{
const size_t num_elems = a.size();
std::vector<int8_t> host_values(num_elems);
std::uniform_int_distribution<int8_t> int8_random(-128, 127);
std::mt19937 rng(0);
std::mt19937 rng(0);
std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng](){ return int8_random(rng); });
std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); });
cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems);
}
......@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition()
int8_t *a_data, *a_t_data;
cudaMalloc(&a_data, m * k * sizeof(int8_t));
Tensor a {MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
fill_tensor_random(a);
cudaMalloc(&a_t_data, k * m * sizeof(int8_t));
Tensor a_t {MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
std::vector<int8_t> a_t_host_ref(a_t.size());
reference_transpose_host(a_t_host_ref, a);
......
#include <assert.h>
#include <math.h>
#include <float.h>
#include <math.h>
#include <stdexcept>
#include <tuple>
#include <vector>
#ifdef __linux__
#include <sys/time.h>
#endif
#include "src/turbomind/kernels/logprob_kernels.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -24,22 +25,26 @@ struct LogProbKernelTestParam {
size_t vocab_size;
size_t beam_width;
std::string toString() {
std::string toString()
{
return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]",
max_input_length, batch_size, vocab_size, beam_width);
max_input_length,
batch_size,
vocab_size,
beam_width);
}
};
/////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T>
class LogProbKernelTest : public FtTestBase {
class LogProbKernelTest: public FtTestBase {
protected:
void computeCumLogProbs(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
void computeCumLogProbs(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
......@@ -54,9 +59,9 @@ protected:
cum_log_probs[i] = 0.0f;
}
else if ((int)step < input_lengths[i]) {
size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
const T* vec = logits + step_offset + i * vocab_size_padded;
float max_logits = -FLT_MAX;
size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
const T* vec = logits + step_offset + i * vocab_size_padded;
float max_logits = -FLT_MAX;
for (size_t v = 0; v < vocab_size; ++v) {
float val = static_cast<float>(vec[v]);
if (val > max_logits) {
......@@ -67,7 +72,7 @@ protected:
for (size_t v = 0; v < vocab_size; ++v) {
sum += expf(static_cast<float>(vec[v]) - max_logits);
}
int token_id = input_ids[step * batch_size + i];
int token_id = input_ids[step * batch_size + i];
float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
if (log_probs != nullptr) {
log_probs[step * batch_size + i] = log_prob;
......@@ -78,11 +83,11 @@ protected:
}
}
void computeCumLogProbsBatchFirst(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
void computeCumLogProbsBatchFirst(float* cum_log_probs,
float* log_probs,
const T* logits,
const int* input_ids,
const int* input_lengths,
const size_t max_input_length,
const size_t batch_size,
const size_t vocab_size,
......@@ -98,8 +103,8 @@ protected:
cum_log_probs[i] = 0.0f;
}
else if ((int)step < input_lengths[i]) {
const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
float max_logits = -FLT_MAX;
const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
float max_logits = -FLT_MAX;
for (size_t v = 0; v < vocab_size; ++v) {
float val = static_cast<float>(vec[v]);
if (val > max_logits) {
......@@ -110,7 +115,7 @@ protected:
for (size_t v = 0; v < vocab_size; ++v) {
sum += expf(static_cast<float>(vec[v]) - max_logits);
}
int token_id = input_ids[i * max_input_length + step];
int token_id = input_ids[i * max_input_length + step];
float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
if (log_probs != nullptr) {
log_probs[i * max_input_length + step] = log_prob;
......@@ -122,17 +127,17 @@ protected:
}
public:
void runTest(LogProbKernelTestParam param) {
void runTest(LogProbKernelTestParam param)
{
size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
// Make multiple of 8 as GPT does.
size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
// input values
T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
int* h_input_ids = new int[max_input_length * batchxbeam];
T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
int* h_input_ids = new int[max_input_length * batchxbeam];
int* h_input_lengths = new int[batchxbeam];
// output buffers
......@@ -145,9 +150,9 @@ public:
memset(expected_cum_log_probs, 0, sizeof(float) * batchxbeam);
// device buffers
T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers
......@@ -157,7 +162,7 @@ public:
deviceFill(d_cum_log_probs, batchxbeam, 0.0f);
size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
void* workspace = allocator->malloc(workspace_size);
void* workspace = allocator->malloc(workspace_size);
invokeLogProbFromLogits(d_cum_log_probs,
d_logits,
d_input_ids,
......@@ -189,16 +194,17 @@ public:
delete[] h_logits;
}
void runBatchFirstTest(LogProbKernelTestParam param) {
void runBatchFirstTest(LogProbKernelTestParam param)
{
size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size;
// Make multiple of 8 as GPT does.
size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
// input values
T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
int* h_input_ids = new int[max_input_length * batchxbeam];
T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
int* h_input_ids = new int[max_input_length * batchxbeam];
int* h_input_lengths = new int[batchxbeam];
// output buffers
......@@ -213,8 +219,8 @@ public:
// device buffers
T* d_logits =
reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers
......@@ -224,7 +230,7 @@ public:
check_cuda_error(cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam));
size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
void* workspace = allocator->malloc(workspace_size);
void* workspace = allocator->malloc(workspace_size);
invokeLogProbFromLogits(d_cum_log_probs,
d_logits,
d_input_ids,
......@@ -239,16 +245,16 @@ public:
true);
computeCumLogProbsBatchFirst(expected_cum_log_probs,
nullptr,
h_logits,
h_input_ids,
h_input_lengths,
max_input_length,
batchxbeam,
vocab_size,
vocab_size_padded);
std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
nullptr,
h_logits,
h_input_ids,
h_input_lengths,
max_input_length,
batchxbeam,
vocab_size,
vocab_size_padded);
std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
EXPECT_TRUE(passed);
delete[] expected_cum_log_probs;
......@@ -256,10 +262,8 @@ public:
delete[] h_input_ids;
delete[] h_logits;
}
};
TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes);
TYPED_TEST(LogProbKernelTest, SingleStep)
......
......@@ -14,24 +14,24 @@
* limitations under the License.
*/
#include <algorithm> // std::min, std::max
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <algorithm> // std::min, std::max
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdexcept>
#include <stdlib.h> // rand
#include <string> // std::string
#include <stdlib.h> // rand
#include <string> // std::string
#include <unordered_map>
#include <vector> // std::vector
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "gtest_utils.h"
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "gtest_utils.h"
using namespace turbomind;
......@@ -41,21 +41,25 @@ struct TemperatureTestParam {
float* temperatures;
size_t temperatures_size;
std::string toString() {
std::string toString()
{
return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]",
batch_size, vocab_size, arr2str(temperatures, temperatures_size).c_str());
batch_size,
vocab_size,
arr2str(temperatures, temperatures_size).c_str());
}
};
size_t pad_vocab_size(size_t vocab_size, size_t pad = 8) {
size_t pad_vocab_size(size_t vocab_size, size_t pad = 8)
{
return (vocab_size + pad - 1) / pad * pad;
}
template<typename T>
void applyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
const float repetition_penalty,
void applyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
const float repetition_penalty,
const size_t step,
const size_t max_input_length,
const size_t batch_size,
......@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits,
int token_id = output_ids[i + t * batch_size];
if (!penalized[token_id]) {
float logit = static_cast<float>(logits[offset + token_id]);
logits[offset + token_id] = static_cast<T>(logit < 0.0f ?
logit * repetition_penalty : logit / repetition_penalty);
logits[offset + token_id] =
static_cast<T>(logit < 0.0f ? logit * repetition_penalty : logit / repetition_penalty);
penalized[token_id] = true;
}
}
......@@ -84,9 +88,9 @@ void applyRepetitonPenalty(T* logits,
}
template<typename T>
void batchApplyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
void batchApplyRepetitonPenalty(T* logits,
const int* output_ids,
const int* input_lengths,
const float* repetition_penalties,
const size_t step,
const size_t max_input_length,
......@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits,
}
template<typename T>
void initLogitsAndBias(T* logits,
T* bias,
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded)
void initLogitsAndBias(
T* logits, T* bias, const size_t batch_size, const size_t vocab_size, const size_t vocab_size_padded)
{
initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f);
if (bias != nullptr) {
......@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits,
}
}
/////////////////////////////////// Tests //////////////////////////////////////////
template<typename T>
class TemperaturePenaltyTest : public FtTestBase {
class TemperaturePenaltyTest: public FtTestBase {
protected:
// Set up test
size_t batch_size_;
......@@ -157,17 +157,18 @@ protected:
float* d_temperatures_;
void subsetup(TemperatureTestParam param) {
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
void subsetup(TemperatureTestParam param)
{
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_);
h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
if (param.temperatures_size > 1) {
......@@ -177,7 +178,8 @@ protected:
}
}
void subteardown() {
void subteardown()
{
delete[] h_logits_;
delete[] h_bias_;
}
......@@ -195,7 +197,7 @@ protected:
ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature;
for (size_t j = 0; j < vocab_size; ++j) {
size_t index = i * vocab_size_padded + j;
float logit = static_cast<float>(logits[index]);
float logit = static_cast<float>(logits[index]);
if (bias != nullptr) {
logit += static_cast<float>(bias[j]);
}
......@@ -204,29 +206,18 @@ protected:
}
}
public:
void runTest(TemperatureTestParam param)
{
subsetup(param);
// Do test
if (param.temperatures_size == 1) {
invokeApplyTemperaturePenalty(d_logits_,
d_bias_,
param.temperatures[0],
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
invokeApplyTemperaturePenalty(
d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
}
else {
invokeBatchApplyTemperaturePenalty(d_logits_,
d_bias_,
d_temperatures_,
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
invokeBatchApplyTemperaturePenalty(
d_logits_, d_bias_, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
}
computeReference(h_logits_,
h_bias_,
......@@ -240,21 +231,17 @@ public:
subteardown();
}
void runConsistencyTest(TemperatureTestParam param) {
void runConsistencyTest(TemperatureTestParam param)
{
// Set up test
ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1";
subsetup(param);
// Run a single runtime value case.
invokeApplyTemperaturePenalty(d_logits_,
d_bias_,
param.temperatures[0],
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
float temperature = param.temperatures[0];
invokeApplyTemperaturePenalty(
d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
float temperature = param.temperatures[0];
float* h_temperatures = new float[batch_size_];
for (size_t i = 0; i < batch_size_; ++i) {
h_temperatures[i] = temperature;
......@@ -263,18 +250,14 @@ public:
cudaAutoCpy(d_temperatures_, h_temperatures, batch_size_, stream);
T* d_logits_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream);
invokeBatchApplyTemperaturePenalty(d_logits_batch,
d_bias_batch,
d_temperatures_,
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
bool passed = checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
invokeBatchApplyTemperaturePenalty(
d_logits_batch, d_bias_batch, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
bool passed =
checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
EXPECT_TRUE(passed);
// Tear down test
......@@ -315,7 +298,7 @@ TYPED_TEST(TemperaturePenaltyTest, LargeVocab)
TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 1.0f;
......@@ -325,7 +308,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 0.53f;
......@@ -335,7 +318,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 2.01f;
......@@ -345,10 +328,10 @@ TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
TYPED_TEST(TemperaturePenaltyTest, BatchMixed)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = i % 2 ==0 ? 2.01f : 0.53f;
temperatures[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, temperatures, batch_size});
}
......@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase {
size_t repetition_penalties_size;
RepetitionPenaltyType repetition_penalty_type;
std::string toString() {
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map {
std::string toString()
{
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map{
{RepetitionPenaltyType::Additive, "additive"},
{RepetitionPenaltyType::Multiplicative, "multiplicative"},
{RepetitionPenaltyType::None, "none"}};
return fmtstr(
"RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
"repetition_penalties=%s, repetition_penalty_type=%s]",
batch_size, vocab_size, max_input_length,
arr2str(repetition_penalties, repetition_penalties_size).c_str(),
typestr_map.at(repetition_penalty_type).c_str());
return fmtstr("RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
"repetition_penalties=%s, repetition_penalty_type=%s]",
batch_size,
vocab_size,
max_input_length,
arr2str(repetition_penalties, repetition_penalties_size).c_str(),
typestr_map.at(repetition_penalty_type).c_str());
}
};
template<typename T>
class RepetitionPenaltyTest : public FtTestBase {
class RepetitionPenaltyTest: public FtTestBase {
protected:
// Set up test
size_t batch_size_;
......@@ -392,37 +377,38 @@ protected:
size_t sequence_length_;
size_t step_;
T* h_logits_;
T* h_bias_;
T* h_logits_;
T* h_bias_;
int* h_output_ids_;
int* h_input_lengths_;
T* d_logits_;
T* d_bias_;
T* d_logits_;
T* d_bias_;
int* d_output_ids_;
int* d_input_lengths_;
float* d_repetition_penalties_;
void subsetup(RepetitionPenaltyTestCase param) {
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
void subsetup(RepetitionPenaltyTestCase param)
{
batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_);
max_input_length_ = param.max_input_length;
sequence_length_ = 2 * max_input_length_; // input + output
step_ = sequence_length_ * 0.7;
max_input_length_ = param.max_input_length;
sequence_length_ = 2 * max_input_length_; // input + output
step_ = sequence_length_ * 0.7;
h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
h_output_ids_ = new int[sequence_length_ * batch_size_];
h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_];
h_output_ids_ = new int[sequence_length_ * batch_size_];
h_input_lengths_ = new int[batch_size_];
initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
initRandomInt(h_output_ids_, sequence_length_ * batch_size_, 0, vocab_size_);
initRandomInt(h_input_lengths_, batch_size_, 1, max_input_length_);
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
......@@ -437,7 +423,8 @@ protected:
}
}
void subteardown() {
void subteardown()
{
delete[] h_logits_;
delete[] h_bias_;
delete[] h_output_ids_;
......@@ -540,7 +527,8 @@ public:
subteardown();
}
void runConsistencyTest(RepetitionPenaltyTestCase param) {
void runConsistencyTest(RepetitionPenaltyTestCase param)
{
// Set up test
ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1";
subsetup(param);
......@@ -618,7 +606,7 @@ TYPED_TEST(RepetitionPenaltyTest, LargeVocab)
TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = 1.0f;
......@@ -628,7 +616,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = 0.53f;
......@@ -638,7 +626,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 2.01f;
......@@ -648,10 +636,10 @@ TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
TYPED_TEST(RepetitionPenaltyTest, BatchMixed)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Multiplicative});
}
......@@ -664,10 +652,10 @@ TYPED_TEST(RepetitionPenaltyTest, Consistency)
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditive)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 2.01f : 0.53f;
repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
......@@ -680,10 +668,10 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero)
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero2)
{
size_t batch_size = 6;
size_t batch_size = 6;
float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = i % 2 ==0 ? 1.0f : 0.0f;
repetition_penalties[i] = i % 2 == 0 ? 1.0f : 0.0f;
}
this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
......
......@@ -12,6 +12,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......
#include <algorithm> // std::fill_n
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // std::string
#include <vector> // std::vector
#include <algorithm> // std::fill_n
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // std::string
#include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
......@@ -14,6 +14,7 @@
#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -68,9 +69,9 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
}
for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i;
int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval;
probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
}
}
}
......@@ -96,8 +97,8 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
}
for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval;
int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval;
logprobs[idx] = static_cast<T>(logit - logf(sum + EPSILON));
}
}
......@@ -119,10 +120,10 @@ public:
}
protected:
unsigned long long seed = 0;
cudaStream_t stream;
unsigned long long seed = 0;
cudaStream_t stream;
Allocator<AllocatorType::CUDA>* allocator;
curandState_t* curand_states;
curandState_t* curand_states;
};
template<typename T>
......@@ -393,8 +394,8 @@ public:
{
this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
}
};
......@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral)
this->runTest({6, 4, 1, 4, 1.0f, 1});
};
TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63)
{
this->runTest({16, 51200, 1, 63, 1.0f, 8});
......@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP)
this->runBatchTest({8, 4000, 1, 63, 0.3f, 8});
};
template<typename T>
class TopPSamplingKernelTest: public SamplingKernelTest<T> {
......@@ -473,7 +472,7 @@ public:
size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size;
size_t output_len = param.output_len;
size_t seq_len = output_len;
size_t seq_len = output_len;
float top_p = param.top_p;
......@@ -496,8 +495,8 @@ public:
struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>(
allocator->malloc(sizeof(curandState_t) * batch_size, false));
curandState_t* curand_states =
reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream);
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
......@@ -515,17 +514,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0;
size_t workspace_size = 0;
size_t cub_temp_storage_size = 0;
// retrieve the workspace size of the top-p sampling kernel.
invokeTopPSampling<T>(nullptr, // workspace
workspace_size,
cub_temp_storage_size,
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
(T*)nullptr, // log_probs
topp_id_vals_buf,
end_offsets,
......@@ -553,12 +552,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf,
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
invokeTopPSampling<T>(workspace,
workspace_size,
......@@ -612,7 +606,7 @@ public:
size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size;
float top_p = param.top_p;
float top_p = param.top_p;
float* h_top_ps = new float[batch_size];
// Initialize runtime top k values.
for (size_t i = 0; i < batch_size; ++i) {
......@@ -621,7 +615,7 @@ public:
float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size);
size_t output_len = param.output_len;
size_t seq_len = output_len;
size_t seq_len = output_len;
// Logit values in the host of shape (batch_size x vocab_size).
T* h_logits = new T[batch_size * vocab_size];
......@@ -647,8 +641,8 @@ public:
struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>(
allocator->malloc(sizeof(curandState_t) * batch_size, false));
curandState_t* curand_states =
reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream);
float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size));
......@@ -668,17 +662,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0;
size_t workspace_size = 0;
size_t cub_temp_storage_size = 0;
// retrieve the workspace size of the top-p sampling kernel.
invokeBatchTopPSampling<T>(nullptr, // workspace
workspace_size,
cub_temp_storage_size,
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
(T*)nullptr, // log_probs
topp_id_vals_buf,
end_offsets,
......@@ -709,12 +703,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf,
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
invokeBatchTopPSampling<T>(workspace,
workspace_size,
......@@ -773,8 +762,8 @@ public:
{
this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true);
}
};
......@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
};
__global__
void generateRandomNumber(unsigned int *vals, curandState_t *states, const int batch_size) {
__global__ void generateRandomNumber(unsigned int* vals, curandState_t* states, const int batch_size)
{
int idx = threadIdx.x;
if (idx < batch_size) {
vals[idx] = curand(states + idx);
}
}
TEST(SamplingKernelTest, CurandBatchInitialize) {
size_t batch_size = 127;
TEST(SamplingKernelTest, CurandBatchInitialize)
{
size_t batch_size = 127;
cudaStream_t stream;
cudaStreamCreate(&stream);
curandState_t* curand_states;
check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size));
unsigned long long* h_random_seeds = new unsigned long long[batch_size];
const size_t period_size = 3;
const size_t period_size = 3;
for (size_t i = 0; i < batch_size; ++i) {
h_random_seeds[i] = i / period_size;
}
unsigned long long* d_random_seeds;
check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size));
check_cuda_error(cudaMemcpy(d_random_seeds, h_random_seeds,
sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
check_cuda_error(
cudaMemcpy(d_random_seeds, h_random_seeds, sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
// Initialize curand states.
invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream);
......@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
unsigned int* h_rand_vals = new unsigned int[batch_size];
check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size));
generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size);
check_cuda_error(cudaMemcpyAsync(
h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(
cudaMemcpyAsync(h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(cudaStreamSynchronize(stream));
// The same seed produces the same random number.
......
This diff is collapsed.
#include <iostream>
#include <vector>
#include <unordered_map>
#include <vector>
#include <gtest/gtest.h>
......@@ -10,16 +10,17 @@ using namespace turbomind;
namespace {
#define EXPECT_EQUAL_TENSORS(t1, t2) \
do { \
EXPECT_TRUE(t1.where == t2.where); \
EXPECT_TRUE(t1.type == t2.type); \
EXPECT_TRUE(t1.shape == t2.shape); \
EXPECT_TRUE(t1.data == t2.data); \
} while(false)
TEST(TensorMapTest, HasKeyCorrectness) {
bool* v1 = new bool(true);
#define EXPECT_EQUAL_TENSORS(t1, t2) \
do { \
EXPECT_TRUE(t1.where == t2.where); \
EXPECT_TRUE(t1.type == t2.type); \
EXPECT_TRUE(t1.shape == t2.shape); \
EXPECT_TRUE(t1.data == t2.data); \
} while (false)
TEST(TensorMapTest, HasKeyCorrectness)
{
bool* v1 = new bool(true);
float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2};
......@@ -33,8 +34,9 @@ TEST(TensorMapTest, HasKeyCorrectness) {
delete[] v2;
}
TEST(TensorMapTest, InsertCorrectness) {
int* v1 = new int[4]{1, 10, 20, 30};
TEST(TensorMapTest, InsertCorrectness)
{
int* v1 = new int[4]{1, 10, 20, 30};
float* v2 = new float[2]{1.0f, 2.0f};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2);
......@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) {
EXPECT_FALSE(map.isExist("t2"));
}
TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
TEST(TensorMapTest, InsertDoesNotAllowNoneTensor)
{
TensorMap map;
EXPECT_TRUE(map.size() == 0);
// forbid a none tensor.
......@@ -57,10 +60,11 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error);
}
TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey)
{
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
TensorMap map({{"t1", t1}});
EXPECT_TRUE(map.size() == 1);
// forbid a duplicated key.
......@@ -68,8 +72,9 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
delete[] v1;
}
TEST(TensorMapTest, GetValCorrectness) {
int* v1 = new int[4]{1, 10, 20, 30};
TEST(TensorMapTest, GetValCorrectness)
{
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
TensorMap map({{"t1", t1}});
......@@ -93,13 +98,14 @@ TEST(TensorMapTest, GetValCorrectness) {
delete[] v1;
}
TEST(TensorMapTest, GetTensorCorrectness) {
bool* t1_val = new bool(true);
TEST(TensorMapTest, GetTensorCorrectness)
{
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
int* default_val = new int[4]{0, 1, 2, 3};
int* default_val = new int[4]{0, 1, 2, 3};
Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
TensorMap map({{"t1", t1}, {"t2", t2}});
......@@ -114,13 +120,14 @@ TEST(TensorMapTest, GetTensorCorrectness) {
delete[] t1_val;
}
TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
bool* t1_val = new bool(true);
TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap)
{
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
int* default_val = new int[4]{0, 1, 2, 3};
int* default_val = new int[4]{0, 1, 2, 3};
Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
const TensorMap map({{"t1", t1}, {"t2", t2}});
......@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
delete[] t1_val;
}
TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
TEST(TensorTest, EmptyTensorMinMaxRaiseError)
{
Tensor t1;
EXPECT_THROW(t1.min<int>(), std::runtime_error);
EXPECT_THROW(t1.max<int>(), std::runtime_error);
......@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
EXPECT_THROW(t2.max<int>(), std::runtime_error);
}
using TensorTypes = testing::Types<int8_t, int, float>;
template <typename T>
class TensorFuncTest : public testing::Test {};
template<typename T>
class TensorFuncTest: public testing::Test {};
TYPED_TEST_SUITE(TensorFuncTest, TensorTypes);
TYPED_TEST(TensorFuncTest, MaxCorrectness) {
TYPED_TEST(TensorFuncTest, MaxCorrectness)
{
using T = TypeParam;
size_t size = 4;
T* v1 = new T[size] {T(1), T(2), T(3), T(4)};
T* v2 = new T[size] {T(4), T(3), T(2), T(1)};
T* v3 = new T[size] {T(1), T(2), T(4), T(3)};
T* v1 = new T[size]{T(1), T(2), T(3), T(4)};
T* v2 = new T[size]{T(4), T(3), T(2), T(1)};
T* v3 = new T[size]{T(1), T(2), T(4), T(3)};
Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1);
Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2);
......@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) {
delete[] v3;
}
TYPED_TEST(TensorFuncTest, MinCorrectness) {
TYPED_TEST(TensorFuncTest, MinCorrectness)
{
using T = TypeParam;
size_t size = 4;
......@@ -197,42 +206,45 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) {
delete[] v3;
}
TYPED_TEST(TensorFuncTest, AnyCorrectness) {
TYPED_TEST(TensorFuncTest, AnyCorrectness)
{
using T = TypeParam;
T* v = new T[4]{T(1), T(2), T(3), T(4)};
T* v = new T[4]{T(1), T(2), T(3), T(4)};
Tensor t = Tensor{MEMORY_CPU, getTensorType<T>(), {4}, v};
EXPECT_TRUE(t.any<T>(T(1)));
EXPECT_FALSE(t.any<T>(T(5)));
delete[] v;
}
TYPED_TEST(TensorFuncTest, AllCorrectness) {
TYPED_TEST(TensorFuncTest, AllCorrectness)
{
using T = TypeParam;
constexpr size_t size = 4;
T* v1 = new T[size]{T(1), T(1), T(1), T(1)};
T* v2 = new T[size]{T(1), T(1), T(1), T(2)};
Tensor t1 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v1};
Tensor t2 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v2};
T* v1 = new T[size]{T(1), T(1), T(1), T(1)};
T* v2 = new T[size]{T(1), T(1), T(1), T(2)};
Tensor t1 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v1};
Tensor t2 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v2};
EXPECT_TRUE(t1.all<T>(T(1)));
EXPECT_FALSE(t2.all<T>(T(2)));
delete[] v1;
delete[] v2;
}
TYPED_TEST(TensorFuncTest, SliceCorrectness) {
TYPED_TEST(TensorFuncTest, SliceCorrectness)
{
using T = TypeParam;
constexpr int size = 12;
T* v = new T[size];
T* v = new T[size];
for (int i = 0; i < size; ++i) {
v[i] = i;
}
DataType dtype = getTensorType<T>();
Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
Tensor t2 = t1.slice({2, 4}, 4);
Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
Tensor t2 = t1.slice({2, 4}, 4);
EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4]));
// An overflowed tensor throws an exception.
......@@ -241,4 +253,4 @@ TYPED_TEST(TensorFuncTest, SliceCorrectness) {
delete[] v;
}
} // end of namespace
} // end of namespace
......@@ -16,15 +16,15 @@
#pragma once
#include <algorithm> // min, max
#include <assert.h> // assert
#include <float.h> // FLT_MAX
#include <iostream> // snprintf
#include <math.h> // expf, log
#include <limits> // numeric_limits
#include <stdlib.h> // rand
#include <string> // string
#include <vector> // vector
#include <algorithm> // min, max
#include <assert.h> // assert
#include <float.h> // FLT_MAX
#include <iostream> // snprintf
#include <limits> // numeric_limits
#include <math.h> // expf, log
#include <stdlib.h> // rand
#include <string> // string
#include <vector> // vector
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
......@@ -36,32 +36,37 @@
using namespace turbomind;
class TestFailureError : public std::exception {
class TestFailureError: public std::exception {
private:
std::string msg_;
public:
explicit TestFailureError() = default;
explicit TestFailureError(std::string name, std::string msg = "") {
explicit TestFailureError(std::string name, std::string msg = "")
{
msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
}
const char* what () const throw () {
const char* what() const throw()
{
return msg_.c_str();
}
};
#define EXPECT_TRUE(cond) \
do { if(!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
#define EXPECT_FALSE(cond) \
do { if(cond) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", \
__func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} } while(false)
#define EXPECT_TRUE(cond) \
do { \
if (!(cond)) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while (false)
#define EXPECT_FALSE(cond) \
do { \
if (cond) { \
TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
throw TestFailureError(__func__); \
} \
} while (false)
bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
{
......@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
}
template<typename T>
bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float rtol) {
size_t failures = 0;
float relative_gap = 0.0f;;
bool checkResult(std::string name, T* out, T* ref, size_t size, float atol, float rtol)
{
size_t failures = 0;
float relative_gap = 0.0f;
;
for (size_t i = 0; i < size; ++i) {
// The values for the output and the reference.
......@@ -109,18 +116,21 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
// Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * size);
TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
failures <= tol_failures ? "....OK" : "FAILED", name.c_str(),
100. * failures / size, atol, rtol, 100. * relative_gap);
failures <= tol_failures ? "....OK" : "FAILED",
name.c_str(),
100. * failures / size,
atol,
rtol,
100. * relative_gap);
return failures <= tol_failures;
}
template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size,
bool device_out = true, bool device_ref = false)
bool checkResult(std::string name, T* out, T* ref, size_t size, bool device_out = true, bool device_ref = false)
{
bool is_fp32 = sizeof(T) == 4;
float atol = is_fp32 ? 1e-4f : 1e-3f;
float rtol = is_fp32 ? 1e-2f : 1e-1f;
bool is_fp32 = sizeof(T) == 4;
float atol = is_fp32 ? 1e-4f : 1e-3f;
float rtol = is_fp32 ? 1e-2f : 1e-1f;
T* h_out = nullptr;
if (device_out) {
......@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
ref = h_ref;
}
bool is_ok = checkResult(name, out, ref, size, atol, rtol);
if (h_out != nullptr){
if (h_out != nullptr) {
delete[] h_out;
}
if (h_ref != nullptr) {
......@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
}
template<typename T>
void initRandom(T* ptr, size_t size, float minval, float maxval) {
void initRandom(T* ptr, size_t size, float minval, float maxval)
{
for (size_t i = 0; i < size; ++i) {
float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
val *= (maxval - minval);
......@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) {
}
}
void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
void initRandomInt(int* ptr, size_t size, int minval, int maxval)
{
assert(minval < maxval);
int mod = maxval - minval;
for (size_t i = 0; i < size; ++i) {
......@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
}
template<typename T>
void tile(T* x, int m, int n) {
void tile(T* x, int m, int n)
{
for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) {
x[i * n + j] = x[j];
......@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) {
}
template<typename T>
void tile(T* dst, T* src, int m, int n) {
void tile(T* dst, T* src, int m, int n)
{
for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) {
dst[i * n + j] = src[j];
......@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) {
#define HALF_FLT_MAX 65504.0f
template<typename T>
bool isHalf() {
bool isHalf()
{
return std::is_same<T, half>::value;
}
template<typename T>
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr) {
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr);
}