Unverified commit 4c9959f6 authored by Chen Xin, committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__ (see the portability sketch after this list)

* CASE_K

* uint

* remove not

* HALF_FLT_MAX (see the portability sketch after this list)

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check (see the sketch after this list)

* remove gettimeofday (see the timing sketch after this list)

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
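The `__PRETTY_FUNCTION__` and `HALF_FLT_MAX` items concern constructs that GCC/Clang provide but MSVC does not. Below is a minimal sketch of the kind of shims involved; the macro names are illustrative and the actual contents of the commit's `src/turbomind/macro.h` are not reproduced in this excerpt.

```cpp
#pragma once  // sketch of a shim header in the spirit of src/turbomind/macro.h

#include <cuda_fp16.h>

// MSVC does not define __PRETTY_FUNCTION__; __FUNCSIG__ is its counterpart.
// (This macro name is an assumption, not the commit's.)
#if defined(_MSC_VER)
#define PRETTY_FUNCTION __FUNCSIG__
#else
#define PRETTY_FUNCTION __PRETTY_FUNCTION__
#endif

// Largest finite half-precision value: (2 - 2^-10) * 2^15 = 65504.
// Useful where device code cannot rely on a compiler-specific constant.
#ifndef HALF_FLT_MAX
#define HALF_FLT_MAX 65504.f
#endif
```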
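For the "use cudaStreamSynchronize instead of volatile check" item: instead of the host busy-waiting on a host-visible flag, it blocks until all work queued on the stream has completed. A hedged sketch of the pattern follows; the helper name is illustrative, not the commit's code.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Before (pattern being replaced, per the commit title): the host spun on a
// volatile flag written by the device, e.g.
//     while (!*reinterpret_cast<volatile bool*>(done_flag)) {}
// After: block until everything queued on `stream` has completed.
bool wait_for_stream(cudaStream_t stream)
{
    cudaError_t status = cudaStreamSynchronize(stream);
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return false;
    }
    return true;
}

int main()
{
    cudaStream_t stream;
    if (cudaStreamCreate(&stream) != cudaSuccess) {
        return 1;
    }
    // ... launch kernels / async copies on `stream` here ...
    bool ok = wait_for_stream(stream);
    cudaStreamDestroy(stream);
    return ok ? 0 : 1;
}
```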
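For the "remove gettimeofday" item: `<sys/time.h>` does not exist on MSVC, and `std::chrono` covers the same need portably (the int8 GEMM test in the diff below indeed times with `high_resolution_clock`). A minimal, self-contained sketch:

```cpp
#include <chrono>
#include <cstdio>

// Illustrative replacement for a gettimeofday()-based interval timer.
int main()
{
    using std::chrono::duration_cast;
    using std::chrono::high_resolution_clock;
    using std::chrono::microseconds;

    auto start = high_resolution_clock::now();
    // ... the work being timed goes here ...
    auto end = high_resolution_clock::now();

    auto us = duration_cast<microseconds>(end - start).count();
    std::printf("elapsed: %.3f ms\n", us / 1000.0);
    return 0;
}
```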
parent 0d21f366
@@ -24,6 +24,12 @@
 namespace turbomind {
 
+// cub.cuh brings windows.h
+// should be included after cub.cuh
+#ifdef ERROR
+#undef ERROR
+#endif
+
 class Logger {
 public:
...
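For context on the hunk above: windows.h, pulled in transitively through cub.cuh, defines `ERROR` as an object-like macro, which would corrupt any later identifier named `ERROR`. A self-contained illustration of the clash and the `#undef` guard; the `LogLevel` enum here is hypothetical, standing in for the identifier in logger.h.

```cpp
// Simulate what <windows.h> (reached via cub.cuh) does:
#define ERROR 0

// The guard from the hunk above restores the identifier:
#ifdef ERROR
#undef ERROR
#endif

// Hypothetical stand-in for the clashing identifier in logger.h.
enum class LogLevel {
    DEBUG = 0,
    INFO,
    WARNING,
    ERROR  // would expand to `0` without the #undef and fail to parse
};

int main()
{
    return static_cast<int>(LogLevel::ERROR) == 3 ? 0 : 1;
}
```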
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
 #include "src/turbomind/utils/logger.h"
@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
     }
 
     // get slices
-    ConcateSlice slice0{.slices = {{0, dim0}}};
-    ConcateSlice slice1{.slices = {{0, dim1}}};
+    ConcateSlice slice0{{{0, dim0}}};
+    ConcateSlice slice1{{{0, dim1}}};
     if (slices.size() > 0 && slices[0].slices.size() > 0) {
         slice0 = slices[0];
     }
...
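The `ConcateSlice` change above drops C++20 designated initializers (`.slices = ...`), which GCC/Clang accept as an extension but MSVC rejects in the C++14 mode this project builds with (see `cxx_std_14` in the CMake hunk below), in favor of plain aggregate initialization. A standalone sketch; this `ConcateSlice` definition is illustrative, not the project's:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Illustrative stand-in; the real type lives in turbomind's memory utils.
struct ConcateSlice {
    std::vector<std::pair<std::size_t, std::size_t>> slices;
};

int main()
{
    std::size_t dim0 = 4, dim1 = 8;
    // C++20 designated initializer, rejected by MSVC in C++14 mode:
    // ConcateSlice slice0{.slices = {{0, dim0}}};
    // Plain aggregate initialization expresses the same thing portably:
    ConcateSlice slice0{{{0, dim0}}};
    ConcateSlice slice1{{{0, dim1}}};
    return (slice0.slices.size() == 1 && slice1.slices.size() == 1) ? 0 : 1;
}
```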
@@ -15,6 +15,7 @@
  */
 
 #include "src/turbomind/utils/nccl_utils.h"
+#include "src/turbomind/macro.h"
 #include <atomic>
 
 namespace turbomind {
...
@@ -18,7 +18,7 @@
 #include "nvtx_utils.h"
 
 #ifdef USE_NVTX
-#include "nvToolsExt.h"
+#include "nvtx3/nvToolsExt.h"
 #endif
 
 namespace ft_nvtx {
...
@@ -49,12 +49,12 @@ Tensor fused_gemm_dq_helper(
     const T* scales_ptr = get_ptr<const T>(scales);
 
     turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
     const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
 
     auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
     auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
 
     T* output_tensor_ptr = get_ptr<T>(output_tensor);
     char* ws_ptr = get_ptr<char>(ws_tensor);
 
     cudaEvent_t start, stop;
@@ -258,12 +258,12 @@ Tensor fused_gemm_dq_bias_act_helper(
     const T* bias_ptr = get_ptr<const T>(bias);
 
     turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
     const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
 
     auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
     auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
 
     T* output_tensor_ptr = get_ptr<T>(output_tensor);
     char* ws_ptr = get_ptr<char>(ws_tensor);
 
     fused_gemm_dq_runner.gemm_bias_act(input_act_ptr,
...
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
+#include <chrono>
+#include <cstdlib>
 #include <cublas_v2.h>
 #include <iostream>
 #include <vector>
-#include <cstdlib>
-#include <chrono>
 
 #include "torch/csrc/cuda/Stream.h"
 #include <torch/custom_class.h>
@@ -37,18 +37,17 @@ using torch_ext::get_ptr;
 namespace ft = turbomind;
 
 template<typename T>
-void int8_gemm_test(
-    const int m,
-    const int n,
-    const int k,
-    const at::ScalarType output_data_type,
-    const QuantMode quant_mode,
-    const int iters)
+void int8_gemm_test(const int m,
+                    const int n,
+                    const int k,
+                    const at::ScalarType output_data_type,
+                    const QuantMode quant_mode,
+                    const int iters)
 {
-    const bool per_token_quant = quant_mode == QuantMode::PerTokenChannelQuant
-                                 || quant_mode == QuantMode::PerTokenQuant;
-    const bool per_channel_quant = quant_mode == QuantMode::PerTokenChannelQuant
-                                   || quant_mode == QuantMode::PerChannelQuant;
+    const bool per_token_quant =
+        quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant;
+    const bool per_channel_quant =
+        quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant;
     const int row_scale_size = per_token_quant ? m : 1;
     const int col_scale_size = per_channel_quant ? n : 1;
@@ -76,16 +75,16 @@ void int8_gemm_test(
     ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr<int32_t>(w)}.saveNpy("w.npy");
     ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y)}.saveNpy("y.npy");
 
     auto x_gpu = x.to(at_int8).to(torch::kCUDA);
     auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
     auto w_gpu = w.to(at_int8).to(torch::kCUDA);
     auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
     auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false));
 
-    auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
-                             torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
-    auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) *
-                             torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
+    auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
+                             * torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
+    auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
+                             * torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
     auto alpha_row_torch = alpha_row_cultass.expand({m, 1});
     auto alpha_col_torch = alpha_col_cutlass.expand({1, n});
@@ -101,40 +100,41 @@ void int8_gemm_test(
     auto stream = at::cuda::getCurrentCUDAStream().stream();
 
     // warm_up
     cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
                              get_ptr<int8_t>(w_T_gpu),
                              quant_mode,
                              get_ptr<float>(alpha_col_gpu),
                              get_ptr<float>(alpha_row_gpu),
                              get_ptr<T>(y_gpu),
                              m,
                              n,
                              k,
                              nullptr,
                              0,
                              stream);
 
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)m, (size_t)k}, get_ptr<int8_t>(x_gpu)}.saveNpy("x_gpu.npy");
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy");
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy");
     ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy");
-    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy("y_gpu_int32.npy");
+    ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy(
+        "y_gpu_int32.npy");
 
     ft::check_cuda_error(cudaStreamSynchronize(stream));
     auto start = high_resolution_clock::now();
     for (int i = 0; i < iters; ++i) {
         cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
                                  get_ptr<int8_t>(w_T_gpu),
                                  quant_mode,
                                  get_ptr<float>(alpha_col_gpu),
                                  get_ptr<float>(alpha_row_gpu),
                                  get_ptr<T>(y_gpu),
                                  m,
                                  n,
                                  k,
                                  nullptr,
                                  0,
                                  stream);
     }
     ft::check_cuda_error(cudaStreamSynchronize(stream));
@@ -142,27 +142,30 @@ void int8_gemm_test(
     auto duration = duration_cast<microseconds>(end - start);
 
-    if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
+    if (torch::allclose(
+            (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
         TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
-    } else {
+    }
+    else {
         TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
-        // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
+        // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) *
+        // alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
     }
 }
 
-int main(int argc, char **argv)
+int main(int argc, char** argv)
 {
     if (argc != 7) {
-        TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
+        TM_LOG_ERROR(
+            "arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
         return 0;
     }
 
     const int m = atoi(argv[1]);
     const int n = atoi(argv[2]);
     const int k = atoi(argv[3]);
-    const at::ScalarType output_data_type = atoi(argv[4]) == 0 ?
-        at::ScalarType::Half : at::ScalarType::BFloat16;
-    const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
+    const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? at::ScalarType::Half : at::ScalarType::BFloat16;
+    const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
 
     if (quant_mode == QuantMode::PerChannelQuant) {
         printf("per channel quant \n");
     }
@@ -170,7 +173,8 @@ int main(int argc, char **argv)
     if (output_data_type == at::ScalarType::Half) {
         int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
-    } else {
+    }
+    else {
 #if ENABLE_BF16
         int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
 #endif
...
@@ -20,7 +20,12 @@ FetchContent_Declare(
   GIT_REPOSITORY https://github.com/google/googletest.git
   GIT_TAG        release-1.12.1
 )
-add_definitions(-DTORCH_CUDA=1)
+find_package(CUDAToolkit REQUIRED)
+
+if (NOT MSVC)
+    add_definitions(-DTORCH_CUDA=1)
+endif()
 
 # For Windows: Prevent overriding the parent project's compiler/linker settings
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14)
 # Sorted by alphabetical order of test name.
 target_link_libraries( # Libs for test_attention_kernels
     unittest PUBLIC
-    -lcudart -lcurand
+    CUDA::cudart CUDA::curand
     gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
 target_link_libraries( # Libs for test_logprob_kernels
     unittest PUBLIC
-    -lcudart
+    CUDA::cudart
     logprob_kernels memory_utils cuda_utils logger)
 target_link_libraries( # Libs for test_penalty_kernels
     unittest PUBLIC
-    -lcublas -lcublasLt -lcudart
+    CUDA::cublas CUDA::cublasLt CUDA::cudart
     sampling_penalty_kernels memory_utils cuda_utils logger)
 target_link_libraries( # Libs for test_sampling_kernel
     unittest PUBLIC
-    -lcudart
+    CUDA::cudart
     sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger)
 target_link_libraries( # Libs for test_sampling_layer
     unittest PUBLIC
-    -lcublas -lcublasLt -lcudart
+    CUDA::cublas CUDA::cublasLt CUDA::cudart
     cublasMMWrapper memory_utils
     DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger)
 target_link_libraries( # Libs for test_tensor
@@ -65,7 +70,7 @@ target_link_libraries( # Libs for test_tensor
 remove_definitions(-DTORCH_CUDA=1)
 add_executable(test_gemm test_gemm.cu)
-target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper tensor cuda_utils logger)
+target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
 
 add_executable(test_gpt_kernels test_gpt_kernels.cu)
 target_link_libraries(test_gpt_kernels PUBLIC
@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC
 add_executable(test_context_attention_layer test_context_attention_layer.cu)
 target_link_libraries(test_context_attention_layer PUBLIC
-                      Llama -lcublas -lcublasLt -lcudart
+                      Llama CUDA::cublas CUDA::cublasLt CUDA::cudart
                       unfused_attention_kernels
                       memory_utils tensor cublasMMWrapper cuda_utils logger)
...
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
+#include "gtest_utils.h"
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/kernels/unfused_attention_kernels.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/memory_utils.h"
 #include "src/turbomind/utils/nccl_utils.h"
-#include "gtest_utils.h"
-
 #include <curand.h>
 #include <sstream>
...
@@ -336,35 +336,26 @@ int main(int argc, const char* argv[])
     // compute actual
     using AttentionOp = FlashAttentionOp<scalar_t>;
     using Layout = typename AttentionOp::AttentionLayout;
-    Layout layout_q{.stride_batch = num_heads * seq_len * size_per_head,
-                    .stride_seq = size_per_head,
-                    .stride_head = seq_len * size_per_head};
-    Layout layout_k{.stride_batch = num_heads * key_len * size_per_head,
-                    .stride_seq = size_per_head,
-                    .stride_head = key_len * size_per_head};
-    Layout layout_v{.stride_batch = num_heads * key_len * size_per_head,
-                    .stride_seq = size_per_head,
-                    .stride_head = key_len * size_per_head};
-    Layout layout_o{.stride_batch = num_heads * seq_len * size_per_head,
-                    .stride_seq = num_heads * size_per_head,
-                    .stride_head = size_per_head,
-                    .use_seqlens = true};
+    Layout layout_q{num_heads * seq_len * size_per_head, size_per_head, seq_len * size_per_head};
+    Layout layout_k{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
+    Layout layout_v{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
+    Layout layout_o{num_heads * seq_len * size_per_head, num_heads * size_per_head, size_per_head, true};
     AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head);
     float* accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true);
-    typename AttentionOp::Params attn_params{.attn_out = actual_out_ptr,
-                                             .query = query_ptr,
-                                             .key = key_ptr,
-                                             .val = val_ptr,
-                                             .mask = mask_ptr,
-                                             .out_accum = accum_buf_ptr,
-                                             .cu_seqlens_q = cu_seqlens_ptr,
-                                             .cu_seqlens_k = nullptr,
-                                             .group_size = 1,
-                                             .layout_q = layout_q,
-                                             .layout_k = layout_k,
-                                             .layout_v = layout_v,
-                                             .layout_o = layout_o};
+    typename AttentionOp::Params attn_params{actual_out_ptr,
+                                             query_ptr,
+                                             key_ptr,
+                                             val_ptr,
+                                             mask_ptr,
+                                             accum_buf_ptr,
+                                             cu_seqlens_ptr,
+                                             nullptr,
+                                             1,
+                                             layout_q,
+                                             layout_k,
+                                             layout_v,
+                                             layout_o};
     flash_attention(attn_params, stream);
     sync_check_cuda_error();
...
 #include <assert.h>
+#include <math.h>
 #include <cublas_v2.h>
-#include <math.h>
 #include <numeric>
 #include <stdexcept>
 #include <tuple>
...@@ -18,35 +18,38 @@ using namespace turbomind; ...@@ -18,35 +18,38 @@ using namespace turbomind;
// Can be replaced by the function provided by a test framework // Can be replaced by the function provided by a test framework
class TestFailureError : public std::exception { class TestFailureError: public std::exception {
private: private:
std::string msg_; std::string msg_;
public: public:
explicit TestFailureError() = default; explicit TestFailureError() = default;
explicit TestFailureError(std::string name, std::string msg = "") { explicit TestFailureError(std::string name, std::string msg = "")
{
msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str()); msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
} }
const char* what () const throw () { const char* what() const throw()
{
return msg_.c_str(); return msg_.c_str();
} }
}; };
#define EXPECT_TRUE(cond) \ #define EXPECT_TRUE(cond) \
do { if(!(cond)) { \ do { \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \ if (!(cond)) { \
__func__, __FILE__, __LINE__); \ TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \ throw TestFailureError(__func__); \
} } while(false) } \
} while (false)
#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
do { \ #define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
bool is_ok = checkResult<dtype,ctype>(name, out, ref); \ do { \
if(!is_ok) { \ bool is_ok = checkResult<dtype, ctype>(name, out, ref); \
TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", \ if (!is_ok) { \
__func__, __FILE__, __LINE__); \ TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
throw TestFailureError(__func__); \ throw TestFailureError(__func__); \
} \ } \
} while(false) } while (false)
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
...@@ -58,28 +61,29 @@ private: ...@@ -58,28 +61,29 @@ private:
public: public:
std::vector<size_t> shape; std::vector<size_t> shape;
DataType type; DataType type;
Tensor* tensor; Tensor* tensor;
void* data; void* data;
TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false) TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false)
{ {
this->allocator = allocator; this->allocator = allocator;
this->type = dtype; this->type = dtype;
this->shape = shape; this->shape = shape;
size_t tensor_memsize = this->memsize(); size_t tensor_memsize = this->memsize();
this->data = this->allocator->malloc(tensor_memsize, false); this->data = this->allocator->malloc(tensor_memsize, false);
if (zero_init) { if (zero_init) {
check_cuda_error(cudaMemset(data, 0x0, tensor_memsize)); check_cuda_error(cudaMemset(data, 0x0, tensor_memsize));
} else { }
else {
setRandomValues(); setRandomValues();
} }
this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data); this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data);
} }
TensorWrapper(TensorWrapper const& other) TensorWrapper(TensorWrapper const& other):
: allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor) allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
{ {
TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data); TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
} }
...@@ -91,13 +95,14 @@ public: ...@@ -91,13 +95,14 @@ public:
void setInvalidValues() void setInvalidValues()
{ {
size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half); size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half);
size_t tensor_size = type_size * tensor->size(); size_t tensor_size = type_size * tensor->size();
// Fill by a random number to guarantee invalid values // Fill by a random number to guarantee invalid values
check_cuda_error(cudaMemset(data, 0xdc, tensor_size)); check_cuda_error(cudaMemset(data, 0xdc, tensor_size));
} }
void setRandomValues() { void setRandomValues()
{
// random initialization // random initialization
size_t num_elements = this->size(); size_t num_elements = this->size();
switch (this->type) { switch (this->type) {
...@@ -113,7 +118,8 @@ public: ...@@ -113,7 +118,8 @@ public:
} }
} }
size_t size() { size_t size()
{
size_t n_elements = 1; size_t n_elements = 1;
for (size_t s : this->shape) { for (size_t s : this->shape) {
n_elements *= s; n_elements *= s;
...@@ -121,7 +127,8 @@ public: ...@@ -121,7 +127,8 @@ public:
return n_elements; return n_elements;
} }
size_t memsize() { size_t memsize()
{
size_t type_size = 0; size_t type_size = 0;
switch (this->type) { switch (this->type) {
case TYPE_FP32: case TYPE_FP32:
...@@ -138,13 +145,13 @@ public: ...@@ -138,13 +145,13 @@ public:
}; };
template<DataType computeType> template<DataType computeType>
void computeReference(GemmOp transa, void computeReference(GemmOp transa,
GemmOp transb, GemmOp transb,
TensorWrapper& C, TensorWrapper& C,
TensorWrapper& A, TensorWrapper& A,
TensorWrapper& B, TensorWrapper& B,
float alpha = 1.0f, float alpha = 1.0f,
float beta = 0.0f) float beta = 0.0f)
{ {
size_t m = C.shape[0]; size_t m = C.shape[0];
size_t n = C.shape[1]; size_t n = C.shape[1];
...@@ -154,28 +161,36 @@ void computeReference(GemmOp transa, ...@@ -154,28 +161,36 @@ void computeReference(GemmOp transa,
size_t ldb = (transb == GEMM_OP_N) ? n : k; size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n; size_t ldc = n;
cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F; cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
half h_alpha = (half)alpha; half h_alpha = (half)alpha;
half h_beta = (half)beta; half h_beta = (half)beta;
const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha; const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha;
const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta; const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta;
check_cuda_error(cublasGemmEx(cublas_handle, check_cuda_error(cublasGemmEx(cublas_handle,
getCublasOperation(transb), getCublasOperation(transb),
getCublasOperation(transa), getCublasOperation(transa),
n, m, k, n,
m,
k,
_alpha, _alpha,
(const void*)B.data, btype, ldb, (const void*)B.data,
(const void*)A.data, atype, lda, btype,
ldb,
(const void*)A.data,
atype,
lda,
_beta, _beta,
(void*)C.data, ctype, ldc, (void*)C.data,
ctype,
ldc,
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT)); CUBLAS_GEMM_DEFAULT));
check_cuda_error(cublasDestroy(cublas_handle)); check_cuda_error(cublasDestroy(cublas_handle));
...@@ -199,13 +214,14 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8) ...@@ -199,13 +214,14 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
} }
template<typename T> template<typename T>
bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol) { bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol)
{
assert(out.type == ref.type); assert(out.type == ref.type);
size_t out_size = out.size(); size_t out_size = out.size();
size_t ref_size = ref.size(); size_t ref_size = ref.size();
T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size)); T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size));
T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size)); T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size));
cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost); cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost); cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost);
...@@ -219,7 +235,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa ...@@ -219,7 +235,7 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
bool ok = almostEqual(a, b, atol, rtol); bool ok = almostEqual(a, b, atol, rtol);
// Print the error. // Print the error.
if( !ok && failures < 4 ) { if (!ok && failures < 4) {
TM_LOG_ERROR(">> invalid result for i=%lu:", i); TM_LOG_ERROR(">> invalid result for i=%lu:", i);
TM_LOG_ERROR(">> found......: %10.6f", a); TM_LOG_ERROR(">> found......: %10.6f", a);
TM_LOG_ERROR(">> expected...: %10.6f", b); TM_LOG_ERROR(">> expected...: %10.6f", b);
...@@ -234,38 +250,46 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa ...@@ -234,38 +250,46 @@ bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, floa
// Allow not matched up to 1% elements. // Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * out_size); size_t tol_failures = (size_t)(0.01 * out_size);
TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)", TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
name.c_str(), failures <= tol_failures ? "OK" : "FAILED", name.c_str(),
100. * failures / out_size, atol, rtol); failures <= tol_failures ? "OK" : "FAILED",
100. * failures / out_size,
atol,
rtol);
return failures <= tol_failures; return failures <= tol_failures;
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref) { bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref)
float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f; {
float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f; float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
bool is_ok = false; float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
bool is_ok = false;
if (sizeof(T) == 4) { if (sizeof(T) == 4) {
is_ok = _checkResult<float>(name, out, ref, atol, rtol); is_ok = _checkResult<float>(name, out, ref, atol, rtol);
} else { }
else {
is_ok = _checkResult<half>(name, out, ref, atol, rtol); is_ok = _checkResult<half>(name, out, ref, atol, rtol);
} }
return is_ok; return is_ok;
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
bool checkResult(TensorWrapper& out, TensorWrapper& ref) { bool checkResult(TensorWrapper& out, TensorWrapper& ref)
{
return checkResult<T, computeType>("", out, ref); return checkResult<T, computeType>("", out, ref);
} }
template<typename T> template<typename T>
std::string toString() { std::string toString()
{
std::string str = "dtype="; std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16"; str += std::is_same<T, float>::value ? "FP32" : "FP16";
return str; return str;
} }
template<typename T, DataType ctype> template<typename T, DataType ctype>
std::string toString() { std::string toString()
{
std::string str = "dtype="; std::string str = "dtype=";
str += std::is_same<T, float>::value ? "FP32" : "FP16"; str += std::is_same<T, float>::value ? "FP32" : "FP16";
str += ", compute_type="; str += ", compute_type=";
...@@ -273,7 +297,8 @@ std::string toString() { ...@@ -273,7 +297,8 @@ std::string toString() {
return str; return str;
} }
std::string toString(GemmOp op) { std::string toString(GemmOp op)
{
return op == GEMM_OP_N ? "N" : "T"; return op == GEMM_OP_N ? "N" : "T";
} }
...@@ -282,38 +307,38 @@ struct GemmOpPair { ...@@ -282,38 +307,38 @@ struct GemmOpPair {
GemmOp transb; GemmOp transb;
}; };
static const std::vector<GemmOpPair> op_pairs {{GEMM_OP_N, GEMM_OP_N}, static const std::vector<GemmOpPair> op_pairs{
{GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_N, GEMM_OP_N}, {GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_T, GEMM_OP_N}, {GEMM_OP_T, GEMM_OP_T}};
{GEMM_OP_T, GEMM_OP_N},
{GEMM_OP_T, GEMM_OP_T}};
static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, size_t m, size_t n, size_t k)
size_t m, size_t n, size_t k)
{ {
return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]", return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]",
func_name, getGemmOpString(transa).c_str(), getGemmOpString(transb).c_str(), func_name,
m, n, k); getGemmOpString(transa).c_str(),
getGemmOpString(transb).c_str(),
m,
n,
k);
} }
static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, size_t m, size_t n, size_t k)
size_t m, size_t n, size_t k)
{ {
return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k); return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k);
} }
/////////////////////////////////// Unittests ////////////////////////////////////////// /////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) { void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
TM_LOG_INFO("Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", {
m, n, k, toString<T, computeType>().c_str()); TM_LOG_INFO(
"Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
cudaStream_t stream; cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
Allocator<AllocatorType::CUDA> allocator(getDevice()); Allocator<AllocatorType::CUDA> allocator(getDevice());
DataType dtype = getTensorType<T>(); DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
...@@ -322,72 +347,80 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) { ...@@ -322,72 +347,80 @@ void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false); std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) { for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k); std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name); TM_LOG_DEBUG(tc_name);
computeReference<computeType>(op_pair.transa, op_pair.transb, computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
expected, a_tensor, b_tensor);
size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
size_t ldc = n; size_t ldc = n;
c_tensor.setInvalidValues(); // to guarantee C has invalid data c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa,
a_tensor.data, a_tensor.type, lda, op_pair.transb,
b_tensor.data, b_tensor.type, ldb, m,
c_tensor.data, c_tensor.type, ldc); n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data); op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
} }
check_cuda_error(cudaStreamDestroy(stream)); check_cuda_error(cudaStreamDestroy(stream));
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { void testGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper // Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", TM_LOG_INFO(
m, n, k, toString<T, computeType>().c_str()); "Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice()); Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream; cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
DataType dtype = getTensorType<T>(); DataType dtype = getTensorType<T>();
TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
TensorWrapper expected(&allocator, dtype, {m, n}, true); TensorWrapper expected(&allocator, dtype, {m, n}, true);
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle)); check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream)); check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG); cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex(); std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle, cublasMMWrapper cublas_wrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cudaDataType_t cuda_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t cuda_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;
...@@ -396,7 +429,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { ...@@ -396,7 +429,7 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false); std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
for (auto &op_pair : op_pairs) { for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k); std::string tc_name = getTestName(__func__, op_pair, m, n, k);
// Switch A/B because Gemm expects column major layout as cublas does. // Switch A/B because Gemm expects column major layout as cublas does.
...@@ -405,33 +438,50 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { ...@@ -405,33 +438,50 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
size_t ldc = n; size_t ldc = n;
cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), cublas_wrapper.Gemm(getCublasOperation(op_pair.transb),
getCublasOperation(op_pair.transa), getCublasOperation(op_pair.transa),
n, m, k, n,
b_tensor.data, ldb, m,
a_tensor.data, lda, k,
expected.data, ldc); b_tensor.data,
ldb,
c_tensor.setInvalidValues(); // to guarantee C has invalid data a_tensor.data,
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, lda,
a_tensor.data, a_tensor.type, lda, expected.data,
b_tensor.data, b_tensor.type, ldb, ldc);
c_tensor.data, c_tensor.type, ldc);
c_tensor.setInvalidValues(); // to guarantee C has invalid data
gemm->gemm(op_pair.transa,
op_pair.transb,
m,
n,
k,
a_tensor.data,
a_tensor.type,
lda,
b_tensor.data,
b_tensor.type,
ldb,
c_tensor.data,
c_tensor.type,
ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc);
a_tensor.data, lda,
b_tensor.data, ldb,
c_tensor.data, ldc);
EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data);
a_tensor.data, b_tensor.data, c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
c_tensor.setInvalidValues(); c_tensor.setInvalidValues();
gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, gemm->gemm(op_pair.transa,
a_tensor.data, DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr}, c_tensor.data); op_pair.transb,
m,
n,
k,
a_tensor.data,
DenseWeight<T>{(const T*)b_tensor.data, nullptr, nullptr},
c_tensor.data);
EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
} }
...@@ -442,24 +492,28 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) { ...@@ -442,24 +492,28 @@ void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) {
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k)
{
// Test if Gemm is consistent with cublasWrapper // Test if Gemm is consistent with cublasWrapper
TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]", TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]",
m, n, k, toString<T, computeType>().c_str()); m,
n,
k,
toString<T, computeType>().c_str());
Allocator<AllocatorType::CUDA> allocator(getDevice()); Allocator<AllocatorType::CUDA> allocator(getDevice());
cudaStream_t stream; cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
// batch of in/out tensors // batch of in/out tensors
DataType a_type = getTensorType<T>(); DataType a_type = getTensorType<T>();
DataType b_type = getTensorType<T>(); DataType b_type = getTensorType<T>();
DataType c_type = getTensorType<T>(); DataType c_type = getTensorType<T>();
std::vector<TensorWrapper*> a_tensors; std::vector<TensorWrapper*> a_tensors;
std::vector<TensorWrapper*> b_tensors; std::vector<TensorWrapper*> b_tensors;
std::vector<TensorWrapper*> c_tensors; std::vector<TensorWrapper*> c_tensors;
std::vector<TensorWrapper*> expecteds; std::vector<TensorWrapper*> expecteds;
const size_t batch_size = 3; const size_t batch_size = 3;
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false)); a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false));
b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false)); b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false));
...@@ -484,26 +538,21 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -484,26 +538,21 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
(const T*)expecteds[2]->data}; (const T*)expecteds[2]->data};
T** batch_tensor_ptrs = reinterpret_cast<T**>(allocator.malloc(sizeof(T*) * 16, false)); T** batch_tensor_ptrs = reinterpret_cast<T**>(allocator.malloc(sizeof(T*) * 16, false));
check_cuda_error(cudaMemcpyAsync( check_cuda_error(cudaMemcpyAsync((void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream));
(void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream)); const void* const* batch_a = reinterpret_cast<const void* const*>(batch_tensor_ptrs);
const void* const* batch_a = reinterpret_cast<const void* const*>(batch_tensor_ptrs); const void* const* batch_b = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4);
const void* const* batch_b = reinterpret_cast<const void* const*>(batch_tensor_ptrs + 4); void* const* batch_c = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8);
void* const* batch_c = reinterpret_cast<void* const*>(batch_tensor_ptrs + 8); void* const* batch_expected = reinterpret_cast<void* const*>(batch_tensor_ptrs + 12);
void* const* batch_expected = reinterpret_cast<void* const*>(batch_tensor_ptrs + 12);
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle)); check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream)); check_cuda_error(cublasSetStream(cublas_handle, stream));
cublasAlgoMap cublas_algo_map(GEMM_CONFIG); cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
std::mutex* cublas_wrapper_mutex = new std::mutex(); std::mutex* cublas_wrapper_mutex = new std::mutex();
cublasMMWrapper cublas_wrapper(cublas_handle, cublasMMWrapper cublas_wrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
&allocator);
cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F; cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;
...@@ -512,7 +561,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -512,7 +561,7 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false); std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
gemm->setTypes(a_type, b_type, c_type, computeType); gemm->setTypes(a_type, b_type, c_type, computeType);
for (auto &op_pair : op_pairs) { for (auto& op_pair : op_pairs) {
std::string tc_name = getTestName(__func__, op_pair, m, n, k); std::string tc_name = getTestName(__func__, op_pair, m, n, k);
TM_LOG_DEBUG(tc_name); TM_LOG_DEBUG(tc_name);
...@@ -526,42 +575,51 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -526,42 +575,51 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
n, n,
m, m,
k, k,
(const void* const*)batch_b, ldb, (const void* const*)batch_b,
(const void* const*)batch_a, lda, ldb,
(void* const*)batch_expected, ldc, (const void* const*)batch_a,
lda,
(void* const*)batch_expected,
ldc,
batch_size); batch_size);
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, gemm->batchedGemm(op_pair.transa,
batch_a, a_type, lda, op_pair.transb,
batch_b, b_type, ldb, m,
batch_c, c_type, ldc, n,
k,
batch_a,
a_type,
lda,
batch_b,
b_type,
ldb,
batch_c,
c_type,
ldc,
batch_size); batch_size);
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api1 batch" + std::to_string(i), EXPECT_ALMOST_EQUAL(
T, computeType, *c_tensors[i], *expecteds[i]); tc_name + " api1 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
} }
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues(); c_tensors[i]->setInvalidValues();
} }
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, gemm->batchedGemm(
batch_a, lda, op_pair.transa, op_pair.transb, m, n, k, batch_a, lda, batch_b, ldb, batch_c, ldc, batch_size);
batch_b, ldb,
batch_c, ldc,
batch_size);
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api2 batch" + std::to_string(i), EXPECT_ALMOST_EQUAL(
T, computeType, *c_tensors[i], *expecteds[i]); tc_name + " api2 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
} }
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
c_tensors[i]->setInvalidValues(); c_tensors[i]->setInvalidValues();
} }
gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, batch_a, batch_b, batch_c, batch_size);
batch_a, batch_b, batch_c, batch_size);
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
EXPECT_ALMOST_EQUAL(tc_name + " api3 batch" + std::to_string(i), EXPECT_ALMOST_EQUAL(
T, computeType, *c_tensors[i], *expecteds[i]); tc_name + " api3 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]);
} }
} }
a_tensors.clear(); a_tensors.clear();
...@@ -574,36 +632,36 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) { ...@@ -574,36 +632,36 @@ void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) {
check_cuda_error(cudaStreamDestroy(stream)); check_cuda_error(cudaStreamDestroy(stream));
} }
template<typename T, DataType computeType> template<typename T, DataType computeType>
void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k)
{
    // Test if Gemm is consistent with cublasWrapper
    TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, n=%ld, k=%ld, %s]",
                batch_size,
                m,
                n,
                k,
                toString<T, computeType>().c_str());
    Allocator<AllocatorType::CUDA> allocator(getDevice());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    DataType data_type = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, data_type, {batch_size, m, k}, false);
    TensorWrapper b_tensor(&allocator, data_type, {batch_size, k, n}, false);
    TensorWrapper c_tensor(&allocator, data_type, {batch_size, m, n}, true);
    TensorWrapper expected(&allocator, data_type, {batch_size, m, n}, true);
    cublasHandle_t cublas_handle;
    cublasLtHandle_t cublaslt_handle;
    check_cuda_error(cublasCreate(&cublas_handle));
    check_cuda_error(cublasLtCreate(&cublaslt_handle));
    check_cuda_error(cublasSetStream(cublas_handle, stream));
    cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
    std::mutex* cublas_wrapper_mutex = new std::mutex();
    cublasMMWrapper cublas_wrapper(
        cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
    cudaDataType_t dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
    cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F;

@@ -612,7 +670,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, false, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
    for (auto& op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        // Switch A/B because Gemm expects column major layout as cublas does.

@@ -625,7 +683,7 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

        int64_t stridec = m * n;
        float alpha = 1.0f;
        float beta = 0.0f;
        cublas_wrapper.stridedBatchedGemm(getCublasOperation(op_pair.transb),
                                          getCublasOperation(op_pair.transa),

@@ -650,35 +708,78 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

                                          getCublasDataType(computeType));
        c_tensor.setInvalidValues();  // to guarantee C has invalid data
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 a_tensor.type,
                                 lda,
                                 stridea,
                                 b_tensor.data,
                                 b_tensor.type,
                                 ldb,
                                 strideb,
                                 c_tensor.data,
                                 c_tensor.type,
                                 ldc,
                                 stridec,
                                 batch_size,
                                 computeType,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 lda,
                                 stridea,
                                 b_tensor.data,
                                 ldb,
                                 strideb,
                                 c_tensor.data,
                                 ldc,
                                 stridec,
                                 batch_size,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 stridea,
                                 b_tensor.data,
                                 strideb,
                                 c_tensor.data,
                                 stridec,
                                 batch_size,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->stridedBatchedGemm(op_pair.transa,
                                 op_pair.transb,
                                 m,
                                 n,
                                 k,
                                 a_tensor.data,
                                 b_tensor.data,
                                 c_tensor.data,
                                 batch_size,
                                 alpha,
                                 beta);
        EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected);
    }
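    // The four calls in the loop above exercise the four stridedBatchedGemm
    // overloads: api1 passes explicit per-operand data types plus ld/stride,
    // api2 drops the types (falling back to setTypes), api3 drops the leading
    // dimensions, and api4 keeps only the data pointers, batch count and
    // alpha/beta. Each result is checked against the reference in `expected`
    // produced by cublas_wrapper.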
@@ -692,15 +793,16 @@ void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t

// The current SpGemm only supports TYPE_FP16 for T, computeType,
// but let us keep these template variables for later use.
template<typename T, DataType computeType>
void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k)
{
    TM_LOG_INFO(
        "Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString<T, computeType>().c_str());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    Allocator<AllocatorType::CUDA> allocator(getDevice());
    DataType dtype = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
    TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
    TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);

@@ -709,47 +811,54 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
    for (auto& op_pair : op_pairs) {
        // A/B will be switched in SpGemm.
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        TM_LOG_DEBUG(tc_name);
        b_tensor.setRandomValues();
        pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
        computeReference<computeType>(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor);
        void* b_compressed;
        compressMatrixB(
            &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
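        // pruneMatrixB zeroes elements of B so it satisfies the structured
        // sparsity pattern the sparse GEMM kernels expect (presumably the 2:4
        // pattern used by NVIDIA sparse tensor cores), and compressMatrixB
        // then packs the surviving values and their metadata into
        // b_compressed, which is what the SpGemm calls below consume.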
        size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;
        size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k;
        size_t ldc = n;
        c_tensor.setInvalidValues();  // to guarantee C has invalid data
        gemm->gemm(op_pair.transa,
                   op_pair.transb,
                   m,
                   n,
                   k,
                   a_tensor.data,
                   a_tensor.type,
                   lda,
                   b_compressed,
                   b_tensor.type,
                   ldb,
                   c_tensor.data,
                   c_tensor.type,
                   ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa,
                   op_pair.transb,
                   m,
                   n,
                   k,
                   a_tensor.data,
                   DenseWeight<T>{(const T*)b_tensor.data, nullptr, (const T*)b_compressed},
                   c_tensor.data);

@@ -761,34 +870,34 @@ void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) {

}

template<typename T, DataType computeType>
void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k)
{
    // Test if Gemm is consistent with cublasWrapper
    TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]",
                m,
                n,
                k,
                toString<T, computeType>().c_str());
    Allocator<AllocatorType::CUDA> allocator(getDevice());
    cudaStream_t stream;
    check_cuda_error(cudaStreamCreate(&stream));
    DataType dtype = getTensorType<T>();
    TensorWrapper a_tensor(&allocator, dtype, {m, k}, false);
    TensorWrapper b_tensor(&allocator, dtype, {k, n}, false);
    TensorWrapper c_tensor(&allocator, dtype, {m, n}, true);
    TensorWrapper expected(&allocator, dtype, {m, n}, true);
    cublasHandle_t cublas_handle;
    cublasLtHandle_t cublaslt_handle;
    check_cuda_error(cublasCreate(&cublas_handle));
    check_cuda_error(cublasLtCreate(&cublaslt_handle));
    check_cuda_error(cublasSetStream(cublas_handle, stream));
    cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
    std::mutex* cublas_wrapper_mutex = new std::mutex();
    cublasMMWrapper cublas_wrapper(
        cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator);
    cudaDataType_t cu_dtype = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
    cudaDataType_t cu_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F;

@@ -797,13 +906,12 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {

    std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, true, false);
    gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType);
    for (auto& op_pair : op_pairs) {
        std::string tc_name = getTestName(__func__, op_pair, m, n, k);
        TM_LOG_DEBUG(tc_name);
        b_tensor.setRandomValues();
        pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
        // Switch A/B because Gemm expects column major layout as cublas does.
        size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m;

@@ -814,32 +922,40 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {

                              n,
                              m,
                              k,
                              b_tensor.data,
                              ldb,
                              a_tensor.data,
                              lda,
                              expected.data,
                              ldc);
        void* b_compressed;
        compressMatrixB(
            &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb);
        c_tensor.setInvalidValues();  // to guarantee C has invalid data
        gemm->gemm(op_pair.transa,
                   op_pair.transb,
                   m,
                   n,
                   k,
                   a_tensor.data,
                   a_tensor.type,
                   lda,
                   b_compressed,
                   b_tensor.type,
                   ldb,
                   c_tensor.data,
                   c_tensor.type,
                   ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc);
        EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected);
        c_tensor.setInvalidValues();
        gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data);
        EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected);
    }

@@ -850,18 +966,16 @@ void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) {

}
#endif

int main(int argc, char* argv[])
{
    // testGemmCreate();
    using testcase_t = std::tuple<size_t, size_t, size_t>;
    std::vector<testcase_t> testcases = {
        {16, 32, 64}, {255, 255, 255}, {1041, 2047, 9999}, {1041, 1, 9999}, {1041, 999, 1}};
    // Computation correctness tests
    for (testcase_t& tc : testcases) {
        size_t m = std::get<0>(tc);
        size_t n = std::get<1>(tc);
        size_t k = std::get<2>(tc);

@@ -887,16 +1001,16 @@ int main(int argc, char* argv[]) {

    // Reset for SpGemm test.
    testcases.clear();
    testcases.insert(testcases.end(),
                     {{8, 32, 32},  // minimum possible example.
                      {8, 32, 64},
                      {64, 64, 64},
                      {16, 32, 64},
                      {1024, 32, 1024},
                      {1024, 1024, 32},
                      {16, 1024, 1024},
                      {1024, 1024, 1024}});
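    // Unlike the dense cases above (which deliberately include odd sizes such
    // as 1041 x 999 x 1), every SpGemm shape here is a multiple of 8/16/32;
    // the structured-sparse kernels appear to require such aligned
    // dimensions, as the "{8, 32, 32} minimum possible example" comment
    // suggests.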
    for (testcase_t& tc : testcases) {
        size_t m = std::get<0>(tc);
        size_t n = std::get<1>(tc);
        size_t k = std::get<2>(tc);
...
@@ -5,10 +5,10 @@

#include <string>
#include <vector>
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include <algorithm>
#include <iostream>

@@ -39,13 +39,14 @@ protected:

    void testTransposition();
};

void fill_tensor_random(Tensor a)
{
    const size_t num_elems = a.size();
    std::vector<int8_t> host_values(num_elems);
    std::uniform_int_distribution<int8_t> int8_random(-128, 127);
    std::mt19937 rng(0);
    std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); });
    cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems);
}
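// Portability note: std::uniform_int_distribution is only guaranteed for
// short/int/long (and their unsigned/long long variants); int8_t is not a
// sanctioned IntType, and MSVC's standard library rejects it at compile time.
// A portable variant distributes over int and casts, e.g.:
//     std::uniform_int_distribution<int> dist(-128, 127);
//     ... static_cast<int8_t>(dist(rng)) ...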
@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition()

    int8_t *a_data, *a_t_data;
    cudaMalloc(&a_data, m * k * sizeof(int8_t));
    Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
    fill_tensor_random(a);
    cudaMalloc(&a_t_data, k * m * sizeof(int8_t));
    Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
    std::vector<int8_t> a_t_host_ref(a_t.size());
    reference_transpose_host(a_t_host_ref, a);
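    // reference_transpose_host computes the expected row-major (2048, 32)
    // transpose of the random (32, 2048) input on the CPU; the elided lines
    // below presumably launch the device transpose kernel into a_t and
    // compare it against this reference.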
...
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdexcept>
#include <tuple>
#include <vector>
#ifdef __linux__
#include <sys/time.h>
#endif
#include "src/turbomind/kernels/logprob_kernels.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
@@ -24,22 +25,26 @@ struct LogProbKernelTestParam {

    size_t vocab_size;
    size_t beam_width;

    std::string toString()
    {
        return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]",
                      max_input_length,
                      batch_size,
                      vocab_size,
                      beam_width);
    }
};

/////////////////////////////////// Unittests //////////////////////////////////////////

template<typename T>
class LogProbKernelTest: public FtTestBase {
protected:
    void computeCumLogProbs(float* cum_log_probs,
                            float* log_probs,
                            const T* logits,
                            const int* input_ids,
                            const int* input_lengths,
                            const size_t max_input_length,
                            const size_t batch_size,
                            const size_t vocab_size,

@@ -54,9 +59,9 @@ protected:

                    cum_log_probs[i] = 0.0f;
                }
                else if ((int)step < input_lengths[i]) {
                    size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
                    const T* vec = logits + step_offset + i * vocab_size_padded;
                    float max_logits = -FLT_MAX;
                    for (size_t v = 0; v < vocab_size; ++v) {
                        float val = static_cast<float>(vec[v]);
                        if (val > max_logits) {

@@ -67,7 +72,7 @@ protected:

                    for (size_t v = 0; v < vocab_size; ++v) {
                        sum += expf(static_cast<float>(vec[v]) - max_logits);
                    }
                    int token_id = input_ids[step * batch_size + i];
                    float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
                    if (log_probs != nullptr) {
                        log_probs[step * batch_size + i] = log_prob;
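                    // This is the numerically stable log-softmax: with
                    // m = max_v logits[v],
                    //   log p(token) = logits[token] - m - log(sum_v exp(logits[v] - m)),
                    // which avoids overflow in expf for large logits while
                    // giving the same result as the naive formula.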
@@ -78,11 +83,11 @@ protected:

                }
            }

    void computeCumLogProbsBatchFirst(float* cum_log_probs,
                                      float* log_probs,
                                      const T* logits,
                                      const int* input_ids,
                                      const int* input_lengths,
                                      const size_t max_input_length,
                                      const size_t batch_size,
                                      const size_t vocab_size,

@@ -98,8 +103,8 @@ protected:

                    cum_log_probs[i] = 0.0f;
                }
                else if ((int)step < input_lengths[i]) {
                    const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
                    float max_logits = -FLT_MAX;
                    for (size_t v = 0; v < vocab_size; ++v) {
                        float val = static_cast<float>(vec[v]);
                        if (val > max_logits) {

@@ -110,7 +115,7 @@ protected:

                    for (size_t v = 0; v < vocab_size; ++v) {
                        sum += expf(static_cast<float>(vec[v]) - max_logits);
                    }
                    int token_id = input_ids[i * max_input_length + step];
                    float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
                    if (log_probs != nullptr) {
                        log_probs[i * max_input_length + step] = log_prob;

@@ -122,17 +127,17 @@ protected:

    }

public:
    void runTest(LogProbKernelTestParam param)
    {
        size_t max_input_length = param.max_input_length;
        size_t batchxbeam = param.batch_size * param.beam_width;
        size_t vocab_size = param.vocab_size;
        // Make multiple of 8 as GPT does.
        size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
        // input values
        T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
        int* h_input_ids = new int[max_input_length * batchxbeam];
        int* h_input_lengths = new int[batchxbeam];
        // output buffers

@@ -145,9 +150,9 @@ public:

        memset(expected_cum_log_probs, 0, sizeof(float) * batchxbeam);
        // device buffers
        T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
        int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
        int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
        float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
        // initialize device buffers

@@ -157,7 +162,7 @@ public:

        deviceFill(d_cum_log_probs, batchxbeam, 0.0f);
        size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
        void* workspace = allocator->malloc(workspace_size);
        invokeLogProbFromLogits(d_cum_log_probs,
                                d_logits,
                                d_input_ids,

@@ -189,16 +194,17 @@ public:

        delete[] h_logits;
    }

    void runBatchFirstTest(LogProbKernelTestParam param)
    {
        size_t max_input_length = param.max_input_length;
        size_t batchxbeam = param.batch_size * param.beam_width;
        size_t vocab_size = param.vocab_size;
        // Make multiple of 8 as GPT does.
        size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
        // input values
        T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
        int* h_input_ids = new int[max_input_length * batchxbeam];
        int* h_input_lengths = new int[batchxbeam];
        // output buffers

@@ -213,8 +219,8 @@ public:

        // device buffers
        T* d_logits =
            reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded));
        int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
        int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
        float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
        // initialize device buffers

@@ -224,7 +230,7 @@ public:

        check_cuda_error(cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam));
        size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
        void* workspace = allocator->malloc(workspace_size);
        invokeLogProbFromLogits(d_cum_log_probs,
                                d_logits,
                                d_input_ids,

@@ -239,16 +245,16 @@ public:

                                true);
        computeCumLogProbsBatchFirst(expected_cum_log_probs,
                                     nullptr,
                                     h_logits,
                                     h_input_ids,
                                     h_input_lengths,
                                     max_input_length,
                                     batchxbeam,
                                     vocab_size,
                                     vocab_size_padded);
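        // Note the layout difference between the two reference paths: the
        // step-major variant indexes logits as [step, batch, vocab] and
        // input_ids as input_ids[step * batch_size + i], while this
        // batch-first variant indexes logits as [batch, step, vocab] and
        // input_ids as input_ids[i * max_input_length + step], matching the
        // trailing `true` flag passed to invokeLogProbFromLogits above.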
        std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
        bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
        EXPECT_TRUE(passed);
        delete[] expected_cum_log_probs;

@@ -256,10 +262,8 @@ public:

        delete[] h_input_ids;
        delete[] h_logits;
    }
};

TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes);

TYPED_TEST(LogProbKernelTest, SingleStep)
...
@@ -14,24 +14,24 @@

 * limitations under the License.
 */

#include <algorithm>  // std::min, std::max
#include <iostream>   // snprintf
#include <math.h>     // expf, log
#include <stdexcept>
#include <stdlib.h>  // rand
#include <string>    // std::string
#include <unordered_map>
#include <vector>  // std::vector
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include "gtest_utils.h"

using namespace turbomind;

@@ -41,21 +41,25 @@ struct TemperatureTestParam {

    float* temperatures;
    size_t temperatures_size;

    std::string toString()
    {
        return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]",
                      batch_size,
                      vocab_size,
                      arr2str(temperatures, temperatures_size).c_str());
    }
};

size_t pad_vocab_size(size_t vocab_size, size_t pad = 8)
{
    return (vocab_size + pad - 1) / pad * pad;
}
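// pad_vocab_size rounds up to the next multiple of `pad` with pure integer
// arithmetic, e.g. pad_vocab_size(50001) = (50001 + 7) / 8 * 8 = 50008, while
// values already on the boundary are unchanged: pad_vocab_size(50008) = 50008.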
template<typename T>
void applyRepetitonPenalty(T* logits,
                           const int* output_ids,
                           const int* input_lengths,
                           const float repetition_penalty,
                           const size_t step,
                           const size_t max_input_length,
                           const size_t batch_size,

@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits,

            int token_id = output_ids[i + t * batch_size];
            if (!penalized[token_id]) {
                float logit = static_cast<float>(logits[offset + token_id]);
                logits[offset + token_id] =
                    static_cast<T>(logit < 0.0f ? logit * repetition_penalty : logit / repetition_penalty);
                penalized[token_id] = true;
            }
        }
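        // Multiplicative repetition penalty: for a penalty > 1 a negative
        // logit is multiplied and a positive one divided, so an already-seen
        // token always becomes less likely regardless of the logit's sign;
        // the `penalized` set ensures each token id is penalized at most once.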
@@ -84,9 +88,8 @@ void applyRepetitonPenalty(T* logits,

    }
}

template<typename T>
void batchApplyRepetitonPenalty(T* logits,
                                const int* output_ids,
                                const int* input_lengths,
                                const float* repetition_penalties,
                                const size_t step,
                                const size_t max_input_length,

@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits,

    }
}

template<typename T>
void initLogitsAndBias(
    T* logits, T* bias, const size_t batch_size, const size_t vocab_size, const size_t vocab_size_padded)
{
    initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f);
    if (bias != nullptr) {

@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits,

    }
}

/////////////////////////////////// Tests //////////////////////////////////////////

template<typename T>
class TemperaturePenaltyTest: public FtTestBase {
protected:
    // Set up test
    size_t batch_size_;

@@ -157,17 +157,18 @@ protected:

    float* d_temperatures_;

    void subsetup(TemperatureTestParam param)
    {
        batch_size_ = param.batch_size;
        vocab_size_ = param.vocab_size;
        vocab_size_padded_ = pad_vocab_size(vocab_size_);
        h_logits_ = new T[batch_size_ * vocab_size_padded_];
        h_bias_ = new T[vocab_size_padded_];
        initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
        d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
        d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
        cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
        if (param.temperatures_size > 1) {

@@ -177,7 +178,7 @@ protected:

        }
    }

    void subteardown()
    {
        delete[] h_logits_;
        delete[] h_bias_;
    }

@@ -195,7 +197,7 @@ protected:

            ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature;
            for (size_t j = 0; j < vocab_size; ++j) {
                size_t index = i * vocab_size_padded + j;
                float logit = static_cast<float>(logits[index]);
                if (bias != nullptr) {
                    logit += static_cast<float>(bias[j]);
                }

@@ -204,29 +206,18 @@ protected:

        }
    }

public:
    void runTest(TemperatureTestParam param)
    {
        subsetup(param);
        // Do test
        if (param.temperatures_size == 1) {
            invokeApplyTemperaturePenalty(
                d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
        }
        else {
            invokeBatchApplyTemperaturePenalty(
                d_logits_, d_bias_, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
        }
        computeReference(h_logits_,
                         h_bias_,

@@ -240,21 +231,17 @@ public:

        subteardown();
    }

    void runConsistencyTest(TemperatureTestParam param)
    {
        // Set up test
        ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1";
        subsetup(param);
        // Run a single runtime value case.
        invokeApplyTemperaturePenalty(
            d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);

        float temperature = param.temperatures[0];
        float* h_temperatures = new float[batch_size_];
        for (size_t i = 0; i < batch_size_; ++i) {
            h_temperatures[i] = temperature;

@@ -263,18 +250,14 @@ public:

        cudaAutoCpy(d_temperatures_, h_temperatures, batch_size_, stream);
        T* d_logits_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
        T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
        cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream);
        invokeBatchApplyTemperaturePenalty(
            d_logits_batch, d_bias_batch, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
        bool passed =
            checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
        EXPECT_TRUE(passed);
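        // Consistency check: the single-temperature kernel and the batched
        // kernel (fed the same temperature replicated batch_size_ times on
        // identical input logits) must produce matching outputs.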
        // Tear down test

@@ -315,7 +298,7 @@ TYPED_TEST(TemperaturePenaltyTest, LargeVocab)

TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 1.0f;

@@ -325,7 +308,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)

TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 0.53f;

@@ -335,7 +318,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)

TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 2.01f;

@@ -345,10 +328,10 @@ TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)

TYPED_TEST(TemperaturePenaltyTest, BatchMixed)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, temperatures, batch_size});
}
@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase {

    size_t repetition_penalties_size;
    RepetitionPenaltyType repetition_penalty_type;

    std::string toString()
    {
        static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map{
            {RepetitionPenaltyType::Additive, "additive"},
            {RepetitionPenaltyType::Multiplicative, "multiplicative"},
            {RepetitionPenaltyType::None, "none"}};
        return fmtstr("RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
                      "repetition_penalties=%s, repetition_penalty_type=%s]",
                      batch_size,
                      vocab_size,
                      max_input_length,
                      arr2str(repetition_penalties, repetition_penalties_size).c_str(),
                      typestr_map.at(repetition_penalty_type).c_str());
    }
};

template<typename T>
class RepetitionPenaltyTest: public FtTestBase {
protected:
    // Set up test
    size_t batch_size_;
@@ -392,37 +377,38 @@ protected:

    size_t sequence_length_;
    size_t step_;

    T* h_logits_;
    T* h_bias_;
    int* h_output_ids_;
    int* h_input_lengths_;

    T* d_logits_;
    T* d_bias_;
    int* d_output_ids_;
    int* d_input_lengths_;

    float* d_repetition_penalties_;

    void subsetup(RepetitionPenaltyTestCase param)
    {
        batch_size_ = param.batch_size;
        vocab_size_ = param.vocab_size;
        vocab_size_padded_ = pad_vocab_size(vocab_size_);
        max_input_length_ = param.max_input_length;
        sequence_length_ = 2 * max_input_length_;  // input + output
        step_ = sequence_length_ * 0.7;

        h_logits_ = new T[batch_size_ * vocab_size_padded_];
        h_bias_ = new T[vocab_size_padded_];
        h_output_ids_ = new int[sequence_length_ * batch_size_];
        h_input_lengths_ = new int[batch_size_];
        initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
        initRandomInt(h_output_ids_, sequence_length_ * batch_size_, 0, vocab_size_);
        initRandomInt(h_input_lengths_, batch_size_, 1, max_input_length_);
        d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
        d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
        d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
        d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
        cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);

@@ -437,7 +423,8 @@ protected:

        }
    }

    void subteardown()
    {
        delete[] h_logits_;
        delete[] h_bias_;
        delete[] h_output_ids_;

@@ -540,7 +527,8 @@ public:

        subteardown();
    }

    void runConsistencyTest(RepetitionPenaltyTestCase param)
    {
        // Set up test
        ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1";
        subsetup(param);
@@ -618,7 +606,7 @@ TYPED_TEST(RepetitionPenaltyTest, LargeVocab)

TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = 1.0f;

@@ -628,7 +616,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)

TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = 0.53f;

@@ -638,7 +626,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)

TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
{
    size_t batch_size = 6;
    float* temperatures = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = 2.01f;

@@ -648,10 +636,10 @@ TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)

TYPED_TEST(RepetitionPenaltyTest, BatchMixed)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Multiplicative});
}

@@ -664,10 +652,10 @@ TYPED_TEST(RepetitionPenaltyTest, Consistency)

TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditive)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}

@@ -680,10 +668,10 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero)

TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero2)
{
    size_t batch_size = 6;
    float* repetition_penalties = new float[batch_size];
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 1.0f : 0.0f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties, batch_size, RepetitionPenaltyType::Additive});
}
...
@@ -12,6 +12,7 @@

#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
...
#include <algorithm>  // std::fill_n
#include <iostream>   // snprintf
#include <math.h>     // expf, log
#include <stdlib.h>   // rand
#include <string>     // std::string
#include <vector>     // std::vector
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>

@@ -14,6 +14,7 @@

#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"

@@ -68,9 +69,9 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size)

            sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
        }
        for (int i = 0; i < vocab_size; ++i) {
            int idx = bidx * vocab_size + i;
            float logit = static_cast<float>(logits[idx]) - maxval;
            probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
        }
    }
}

@@ -96,8 +97,8 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)

            sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
        }
        for (int i = 0; i < vocab_size; ++i) {
            int idx = bidx * vocab_size + i;
            float logit = static_cast<float>(logits[idx]) - maxval;
            logprobs[idx] = static_cast<T>(logit - logf(sum + EPSILON));
        }
    }
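// computeProb/computeLogProb are max-shifted (numerically stable) softmax and
// log-softmax references; the EPSILON added to the denominator guards against
// division by (or log of) zero, at the cost of a tiny bias in the result.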
...@@ -119,10 +120,10 @@ public: ...@@ -119,10 +120,10 @@ public:
} }
protected: protected:
unsigned long long seed = 0; unsigned long long seed = 0;
cudaStream_t stream; cudaStream_t stream;
Allocator<AllocatorType::CUDA>* allocator; Allocator<AllocatorType::CUDA>* allocator;
curandState_t* curand_states; curandState_t* curand_states;
}; };
template<typename T> template<typename T>
...@@ -393,8 +394,8 @@ public: ...@@ -393,8 +394,8 @@ public:
{ {
this->runBatchTest(param, false, false); this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true); this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false); this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true); this->runBatchTest(param, true, true);
} }
}; };
...@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral) ...@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral)
this->runTest({6, 4, 1, 4, 1.0f, 1}); this->runTest({6, 4, 1, 4, 1.0f, 1});
}; };
TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63) TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63)
{ {
this->runTest({16, 51200, 1, 63, 1.0f, 8}); this->runTest({16, 51200, 1, 63, 1.0f, 8});
...@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP) ...@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP)
this->runBatchTest({8, 4000, 1, 63, 0.3f, 8}); this->runBatchTest({8, 4000, 1, 63, 0.3f, 8});
}; };
template<typename T> template<typename T>
class TopPSamplingKernelTest: public SamplingKernelTest<T> { class TopPSamplingKernelTest: public SamplingKernelTest<T> {
...@@ -473,7 +472,7 @@ public: ...@@ -473,7 +472,7 @@ public:
size_t batch_size = param.batch_size; size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
size_t output_len = param.output_len; size_t output_len = param.output_len;
size_t seq_len = output_len; size_t seq_len = output_len;
float top_p = param.top_p; float top_p = param.top_p;
...@@ -496,8 +495,8 @@ public: ...@@ -496,8 +495,8 @@ public:
struct cudaDeviceProp device_prop; struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device); cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>( curandState_t* curand_states =
allocator->malloc(sizeof(curandState_t) * batch_size, false)); reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream); invokeCurandInitialize(curand_states, batch_size, seed, stream);
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size)); int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
...@@ -515,17 +514,17 @@ public: ...@@ -515,17 +514,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1))); int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size)); int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0; size_t workspace_size = 0;
size_t cub_temp_storage_size = 0; size_t cub_temp_storage_size = 0;
// Retrieve the workspace size of the top-p sampling kernel. // Retrieve the workspace size of the top-p sampling kernel.
invokeTopPSampling<T>(nullptr, // workspace invokeTopPSampling<T>(nullptr, // workspace
workspace_size, workspace_size,
cub_temp_storage_size, cub_temp_storage_size,
nullptr, // output_ids nullptr, // output_ids
nullptr, // sequence_length nullptr, // sequence_length
nullptr, // finished_buffer nullptr, // finished_buffer
nullptr, // cum_log_probs nullptr, // cum_log_probs
nullptr, // output_log_probs nullptr, // output_log_probs
(T*)nullptr, // log_probs (T*)nullptr, // log_probs
topp_id_vals_buf, topp_id_vals_buf,
end_offsets, end_offsets,
...@@ -553,12 +552,7 @@ public: ...@@ -553,12 +552,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size); computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size); cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf, invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
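The null-workspace call a few lines above follows the usual CUB convention (note the cub_temp_storage_size out-parameter): a first call with workspace == nullptr only writes back the required byte counts, the buffer is then allocated, and a second call with the real pointer, as below, does the actual sampling. The same two-pass pattern in its plainest CUB form (a standalone sketch with hypothetical buffer names):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Query-then-run: the first call only reports temp_bytes, the second sorts.
    void sortKeysTwoPass(const float* d_keys_in, float* d_keys_out, int n, cudaStream_t stream)
    {
        void*  d_temp     = nullptr;
        size_t temp_bytes = 0;
        cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys_in, d_keys_out, n, 0, 32, stream);
        cudaMalloc(&d_temp, temp_bytes);  // allocate exactly the requested size
        cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes, d_keys_in, d_keys_out, n, 0, 32, stream);
        cudaFree(d_temp);
    }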
invokeTopPSampling<T>(workspace, invokeTopPSampling<T>(workspace,
workspace_size, workspace_size,
...@@ -612,7 +606,7 @@ public: ...@@ -612,7 +606,7 @@ public:
size_t batch_size = param.batch_size; size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
float top_p = param.top_p; float top_p = param.top_p;
float* h_top_ps = new float[batch_size]; float* h_top_ps = new float[batch_size];
// Initialize runtime top-p values. // Initialize runtime top-p values.
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
...@@ -621,7 +615,7 @@ public: ...@@ -621,7 +615,7 @@ public:
float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size); float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size);
size_t output_len = param.output_len; size_t output_len = param.output_len;
size_t seq_len = output_len; size_t seq_len = output_len;
// Logit values on the host of shape (batch_size x vocab_size). // Logit values on the host of shape (batch_size x vocab_size).
T* h_logits = new T[batch_size * vocab_size]; T* h_logits = new T[batch_size * vocab_size];
...@@ -647,8 +641,8 @@ public: ...@@ -647,8 +641,8 @@ public:
struct cudaDeviceProp device_prop; struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device); cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>( curandState_t* curand_states =
allocator->malloc(sizeof(curandState_t) * batch_size, false)); reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream); invokeCurandInitialize(curand_states, batch_size, seed, stream);
float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size)); float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size));
...@@ -668,17 +662,17 @@ public: ...@@ -668,17 +662,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1))); int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size)); int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0; size_t workspace_size = 0;
size_t cub_temp_storage_size = 0; size_t cub_temp_storage_size = 0;
// Retrieve the workspace size of the top-p sampling kernel. // Retrieve the workspace size of the top-p sampling kernel.
invokeBatchTopPSampling<T>(nullptr, // workspace invokeBatchTopPSampling<T>(nullptr, // workspace
workspace_size, workspace_size,
cub_temp_storage_size, cub_temp_storage_size,
nullptr, // output_ids nullptr, // output_ids
nullptr, // sequence_length nullptr, // sequence_length
nullptr, // finished_buffer nullptr, // finished_buffer
nullptr, // cum_log_probs nullptr, // cum_log_probs
nullptr, // output_log_probs nullptr, // output_log_probs
(T*)nullptr, // log_probs (T*)nullptr, // log_probs
topp_id_vals_buf, topp_id_vals_buf,
end_offsets, end_offsets,
...@@ -709,12 +703,7 @@ public: ...@@ -709,12 +703,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size); computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size); cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf, invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeBatchTopPSampling<T>(workspace, invokeBatchTopPSampling<T>(workspace,
workspace_size, workspace_size,
...@@ -773,8 +762,8 @@ public: ...@@ -773,8 +762,8 @@ public:
{ {
this->runBatchTest(param, false, false); this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true); this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false); this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true); this->runBatchTest(param, true, true);
} }
}; };
...@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2) ...@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
this->runBatchTest({8, 4000, 1, 0, 0.9f, 16}); this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
}; };
__global__ __global__ void generateRandomNumber(unsigned int* vals, curandState_t* states, const int batch_size)
void generateRandomNumber(unsigned int *vals, curandState_t *states, const int batch_size) { {
int idx = threadIdx.x; int idx = threadIdx.x;
if (idx < batch_size) { if (idx < batch_size) {
vals[idx] = curand(states + idx); vals[idx] = curand(states + idx);
} }
} }
TEST(SamplingKernelTest, CurandBatchInitialize) { TEST(SamplingKernelTest, CurandBatchInitialize)
size_t batch_size = 127; {
size_t batch_size = 127;
cudaStream_t stream; cudaStream_t stream;
cudaStreamCreate(&stream); cudaStreamCreate(&stream);
curandState_t* curand_states; curandState_t* curand_states;
check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size)); check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size));
unsigned long long* h_random_seeds = new unsigned long long[batch_size]; unsigned long long* h_random_seeds = new unsigned long long[batch_size];
const size_t period_size = 3; const size_t period_size = 3;
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
h_random_seeds[i] = i / period_size; h_random_seeds[i] = i / period_size;
} }
unsigned long long* d_random_seeds; unsigned long long* d_random_seeds;
check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size)); check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size));
check_cuda_error(cudaMemcpy(d_random_seeds, h_random_seeds, check_cuda_error(
sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice)); cudaMemcpy(d_random_seeds, h_random_seeds, sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
// Initialize curand states. // Initialize curand states.
invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream); invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream);
...@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) { ...@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
unsigned int* h_rand_vals = new unsigned int[batch_size]; unsigned int* h_rand_vals = new unsigned int[batch_size];
check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size)); check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size));
generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size); generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size);
check_cuda_error(cudaMemcpyAsync( check_cuda_error(
h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream)); cudaMemcpyAsync(h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(cudaStreamSynchronize(stream)); check_cuda_error(cudaStreamSynchronize(stream));
// The same seed produces the same random number. // The same seed produces the same random number.
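The property this test leans on: curand_init() with an identical (seed, subsequence, offset) triple always reproduces the same state, so two states seeded alike yield the same draw. A self-contained illustration (assumed names, not part of the test file; launch with <<<1, 1>>>):

    #include <curand_kernel.h>

    // Two states initialized identically produce identical sequences.
    __global__ void sameSeedSameDraw(unsigned int* out)
    {
        curandState_t a, b;
        curand_init(/*seed=*/42ULL, /*subsequence=*/0, /*offset=*/0, &a);
        curand_init(/*seed=*/42ULL, /*subsequence=*/0, /*offset=*/0, &b);
        out[0] = curand(&a);
        out[1] = curand(&b);  // always equal to out[0]
    }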
......
#include <algorithm> // std::min, std::max #include <algorithm> // std::min, std::max
#include <iostream> // snprintf #include <iostream> // snprintf
#include <math.h> // expf, log #include <math.h> // expf, log
#include <stdlib.h> // rand #include <stdlib.h> // rand
#include <string> // std::string #include <string> // std::string
#include <vector> // std::vector #include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h> #include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "gtest_utils.h" #include "gtest_utils.h"
...@@ -26,17 +27,24 @@ struct SamplingLayerTestParam { ...@@ -26,17 +27,24 @@ struct SamplingLayerTestParam {
size_t vocab_size; size_t vocab_size;
size_t beam_width; size_t beam_width;
size_t top_k; size_t top_k;
float top_p; float top_p;
size_t output_len; size_t output_len;
std::string toString() { std::string toString()
{
return fmtstr("SamplingLayerTestParam[batch=%ld, vocab=%ld, beam=%ld, k=%ld, p=%3.1f, output_len=%ld]", return fmtstr("SamplingLayerTestParam[batch=%ld, vocab=%ld, beam=%ld, k=%ld, p=%3.1f, output_len=%ld]",
batch_size, vocab_size, beam_width, top_k, top_p, output_len); batch_size,
vocab_size,
beam_width,
top_k,
top_p,
output_len);
} }
}; };
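A portability note that matters for the Windows build this PR enables: %ld pairs with long, which stays 32-bit under MSVC's LLP64 model, while the arguments here are size_t. The standard specifier for size_t is %zu, e.g. (an illustrative sketch, not the project's fmtstr):

    #include <cstdio>

    void printParam(size_t batch_size, size_t vocab_size)
    {
        // %zu matches size_t on both LP64 (Linux) and LLP64 (Windows).
        std::printf("batch=%zu, vocab=%zu\n", batch_size, vocab_size);
    }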
template<typename T> template<typename T>
void computeProb(T* probs, T* logits, int batch_size, int vocab_size) { void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
{
// Compute the probability from logits. // Compute the probability from logits.
// logits = batch_size x vocab_size vector. // logits = batch_size x vocab_size vector.
// probs = softmax(logits) (softmax along the vocab dimension) // probs = softmax(logits) (softmax along the vocab dimension)
...@@ -46,14 +54,15 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size) { ...@@ -46,14 +54,15 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size) {
sum += expf((float)logits[bidx * vocab_size + i]); sum += expf((float)logits[bidx * vocab_size + i]);
} }
for (int i = 0; i < vocab_size; ++i) { for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i; int idx = bidx * vocab_size + i;
probs[idx] = static_cast<T>(expf((float)logits[idx]) / (sum + EPSILON)); probs[idx] = static_cast<T>(expf((float)logits[idx]) / (sum + EPSILON));
} }
} }
} }
template<typename T> template<typename T>
void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) { void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
{
// Compute the log probability from logits. // Compute the log probability from logits.
// logits = batch_size x vocab_size vector. // logits = batch_size x vocab_size vector.
// logprobs = log(softmax(logits)) (softmax along the vocab dimension) // logprobs = log(softmax(logits)) (softmax along the vocab dimension)
...@@ -63,7 +72,7 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) { ...@@ -63,7 +72,7 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
sum += expf(logits[bidx * vocab_size + i]); sum += expf(logits[bidx * vocab_size + i]);
} }
for (int i = 0; i < vocab_size; ++i) { for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i; int idx = bidx * vocab_size + i;
logprobs[idx] = static_cast<T>(logf(expf(logits[idx]) / (sum + EPSILON) + EPSILON)); logprobs[idx] = static_cast<T>(logf(expf(logits[idx]) / (sum + EPSILON) + EPSILON));
} }
} }
...@@ -72,44 +81,45 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) { ...@@ -72,44 +81,45 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) {
template<typename T> template<typename T>
class SamplingDecodeTest: public testing::Test { class SamplingDecodeTest: public testing::Test {
protected: protected:
unsigned long long seed = 0; unsigned long long seed = 0;
const static unsigned long long max_seed = 30; const static unsigned long long max_seed = 30;
const size_t batch_size = 6; const size_t batch_size = 6;
const size_t beam_width = 1; const size_t beam_width = 1;
const size_t batchxbeam = batch_size * beam_width; const size_t batchxbeam = batch_size * beam_width;
const size_t vocab_size = 8; const size_t vocab_size = 8;
const size_t max_input_len = 0; // has no effect. const size_t max_input_len = 0; // has no effect.
const size_t max_output_len = 3; const size_t max_output_len = 3;
const size_t max_seq_len = max_input_len + max_output_len; const size_t max_seq_len = max_input_len + max_output_len;
const int end_id = vocab_size - 1; const int end_id = vocab_size - 1;
const DataType data_type = getTensorType<T>(); const DataType data_type = getTensorType<T>();
// vocab size 8 & length 3 // vocab size 8 & length 3
T* test_input_logits; T* test_input_logits;
cudaStream_t stream; cudaStream_t stream;
ft::Allocator<ft::AllocatorType::CUDA>* allocator; ft::Allocator<ft::AllocatorType::CUDA>* allocator;
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
std::mutex *cublas_wrapper_mutex; std::mutex* cublas_wrapper_mutex;
cublasMMWrapper *cublas_wrapper; cublasMMWrapper* cublas_wrapper;
DynamicDecodeLayer<T> *dynamic_decode_layer; DynamicDecodeLayer<T>* dynamic_decode_layer;
int* h_output_ids; int* h_output_ids;
T* h_logits; T* h_logits;
T* h_probs; T* h_probs;
T* h_log_probs; T* h_log_probs;
float* h_cum_log_probs; float* h_cum_log_probs;
float* h_output_log_probs; float* h_output_log_probs;
T* d_logits; T* d_logits;
int* d_input_lengths; int* d_input_lengths;
float* d_cum_log_probs; float* d_cum_log_probs;
float* d_output_log_probs; float* d_output_log_probs;
int* d_output_ids; int* d_output_ids;
int* d_end_ids; int* d_end_ids;
void setup(unsigned long long seed = 0) { void setup(unsigned long long seed = 0)
{
this->seed = seed; this->seed = seed;
check_cuda_error(cudaStreamCreate(&stream)); check_cuda_error(cudaStreamCreate(&stream));
...@@ -124,12 +134,8 @@ protected: ...@@ -124,12 +134,8 @@ protected:
cublasAlgoMap cublas_algo_map(GEMM_CONFIG); cublasAlgoMap cublas_algo_map(GEMM_CONFIG);
cublas_wrapper_mutex = new std::mutex(); cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle, cublas_wrapper = new cublasMMWrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, allocator);
stream,
&cublas_algo_map,
cublas_wrapper_mutex,
allocator);
dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size, dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size, vocab_size,
...@@ -140,26 +146,26 @@ protected: ...@@ -140,26 +146,26 @@ protected:
false, // is_free_buffer_after_forward false, // is_free_buffer_after_forward
&prop); // cuda_device_prop &prop); // cuda_device_prop
h_output_ids = new int[batchxbeam]; h_output_ids = new int[batchxbeam];
h_logits = new T[batchxbeam * vocab_size]; h_logits = new T[batchxbeam * vocab_size];
h_probs = new T[batchxbeam * vocab_size]; h_probs = new T[batchxbeam * vocab_size];
h_log_probs = new T[batchxbeam * vocab_size]; h_log_probs = new T[batchxbeam * vocab_size];
h_cum_log_probs = new float[batchxbeam]; h_cum_log_probs = new float[batchxbeam];
h_output_log_probs = new float[max_output_len * batchxbeam]; h_output_log_probs = new float[max_output_len * batchxbeam];
// prob = (0.4, 0.3, 0.2, 0.1, ...) // prob = (0.4, 0.3, 0.2, 0.1, ...)
test_input_logits = new T[24]{ test_input_logits = new T[24]{
-0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, // step 0 -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, // step 0
-FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, // step 1 -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, // step 1
-FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX // step 2 -FLT_MAX, -FLT_MAX, -0.9163, -1.2040, -1.6094, -2.3026, -FLT_MAX, -FLT_MAX // step 2
}; };
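The constants are simply natural logs of the intended distribution: exp(-0.9163) ≈ 0.4, exp(-1.2040) ≈ 0.3, exp(-1.6094) ≈ 0.2, exp(-2.3026) ≈ 0.1, and the -FLT_MAX entries exponentiate to 0, so each step's softmax reproduces (0.4, 0.3, 0.2, 0.1) over a different 4-token window with no renormalization needed. A quick check (illustrative only):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float logits[4] = {-0.9163f, -1.2040f, -1.6094f, -2.3026f};
        float sum = 0.f;
        for (float x : logits) {
            sum += std::exp(x);
        }
        std::printf("p0=%.3f sum=%.3f\n", std::exp(logits[0]), sum);  // prints p0=0.400 sum=1.000
    }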
d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size, true)); d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size, true));
d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam)); d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
d_output_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batchxbeam)); d_output_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batchxbeam));
d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam)); d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam));
d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
// Initialize to zero. // Initialize to zero.
cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam); cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam);
...@@ -168,7 +174,8 @@ protected: ...@@ -168,7 +174,8 @@ protected:
deviceFill(d_end_ids, batchxbeam, end_id, stream); deviceFill(d_end_ids, batchxbeam, end_id, stream);
} }
void teardown() { void teardown()
{
delete[] test_input_logits; delete[] test_input_logits;
delete[] h_output_ids; delete[] h_output_ids;
delete[] h_logits; delete[] h_logits;
...@@ -185,12 +192,8 @@ protected: ...@@ -185,12 +192,8 @@ protected:
check_cuda_error(cudaStreamDestroy(stream)); check_cuda_error(cudaStreamDestroy(stream));
} }
TensorMap* createInputTensors(int* topk, TensorMap* createInputTensors(
size_t topk_size, int* topk, size_t topk_size, float* topp, size_t topp_size, float* temperature, float* repetition_penalty)
float* topp,
size_t topp_size,
float* temperature,
float* repetition_penalty)
{ {
// construct common input tensors // construct common input tensors
TensorMap* input_tensors = new TensorMap(); TensorMap* input_tensors = new TensorMap();
...@@ -206,16 +209,19 @@ protected: ...@@ -206,16 +209,19 @@ protected:
if (repetition_penalty != nullptr) { if (repetition_penalty != nullptr) {
input_tensors->insert({"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty}}); input_tensors->insert({"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty}});
} }
input_tensors->insert({"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}}); input_tensors->insert(
{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}});
input_tensors->insert({"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}}); input_tensors->insert({"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}});
input_tensors->insert({"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}}); input_tensors->insert({"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}});
input_tensors->insert({"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}}); input_tensors->insert(
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}});
input_tensors->insert({"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batchxbeam}, &d_end_ids}}); input_tensors->insert({"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batchxbeam}, &d_end_ids}});
input_tensors->insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, {1}, &seed}}); input_tensors->insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, {1}, &seed}});
return input_tensors; return input_tensors;
} }
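Each Tensor inserted above is a non-owning descriptor of (memory space, dtype, shape, pointer): for example, Tensor{MEMORY_CPU, TYPE_FP32, {1}, repetition_penalty} wraps a single host-resident float without copying it, while the logits entry points at device memory of shape (batch_size, beam_width, vocab_size).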
TensorMap* createOutputTensors() { TensorMap* createOutputTensors()
{
// construct common output tensors // construct common output tensors
TensorMap* output_tensors = new TensorMap(); TensorMap* output_tensors = new TensorMap();
output_tensors->insert( output_tensors->insert(
...@@ -225,26 +231,27 @@ protected: ...@@ -225,26 +231,27 @@ protected:
{"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, d_cum_log_probs}}); {"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, d_cum_log_probs}});
output_tensors->insert( output_tensors->insert(
{"output_log_probs", {"output_log_probs",
Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, d_output_log_probs}}); Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, d_output_log_probs}});
output_tensors->insert( output_tensors->insert({"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}});
return output_tensors; return output_tensors;
} }
void batchH2Dcpy(T* dst, T* src, size_t m, size_t n) { void batchH2Dcpy(T* dst, T* src, size_t m, size_t n)
{
for (size_t i = 0; i < m; ++i) { for (size_t i = 0; i < m; ++i) {
cudaH2Dcpy(dst + i * n, src, n); cudaH2Dcpy(dst + i * n, src, n);
} }
} }
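batchH2Dcpy broadcasts one host row into every row of a device matrix, so all batch entries end up sampling from the same distribution. Hypothetical usage with the member names above (the actual call site is elided in this diff):

    // Copy the same vocab_size-long host row into all batchxbeam device rows.
    batchH2Dcpy(d_logits, h_logits, batchxbeam, vocab_size);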
bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids) { bool checkResult(int* d_output_ids, std::vector<std::set<int>>& expected_ids)
{
assert(expected_ids.size() == max_seq_len * batchxbeam); assert(expected_ids.size() == max_seq_len * batchxbeam);
int* h_output_ids = new int[max_seq_len * batchxbeam]; int* h_output_ids = new int[max_seq_len * batchxbeam];
cudaD2Hcpy(h_output_ids, d_output_ids, max_seq_len * batchxbeam); cudaD2Hcpy(h_output_ids, d_output_ids, max_seq_len * batchxbeam);
int failures = 0; int failures = 0;
for (size_t i = 0; i < max_seq_len * batchxbeam; ++i) { for (size_t i = 0; i < max_seq_len * batchxbeam; ++i) {
size_t s = i / batchxbeam; size_t s = i / batchxbeam;
size_t b = i % batchxbeam; size_t b = i % batchxbeam;
std::set<int> expts = expected_ids.at(i); std::set<int> expts = expected_ids.at(i);
if (expts.count(h_output_ids[i]) == 0) { if (expts.count(h_output_ids[i]) == 0) {
if (failures < 10) { if (failures < 10) {
...@@ -260,29 +267,29 @@ protected: ...@@ -260,29 +267,29 @@ protected:
++failures; ++failures;
} }
} }
TM_LOG_DEBUG("check...%6s : failures: %d / %d", TM_LOG_DEBUG(
failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam); "check...%6s : failures: %d / %d", failures == 0 ? "....OK" : "FAILED", failures, max_seq_len * batchxbeam);
delete[] h_output_ids; delete[] h_output_ids;
return failures == 0; return failures == 0;
} }
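For orientation, the flat index in checkResult is step-major: with batchxbeam = 6, i = 13 decodes to step s = 13 / 6 = 2 and batch entry b = 13 % 6 = 1, matching the (max_seq_len, batch, beam) layout of d_output_ids.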
public: public:
void runTest(std::vector<std::set<int>> expected_output_ids, void runTest(std::vector<std::set<int>> expected_output_ids,
int* top_ks, int* top_ks,
size_t top_k_size, size_t top_k_size,
float* top_ps, float* top_ps,
size_t top_p_size, size_t top_p_size,
float* temperature, float* temperature,
float* repetition_penalty, float* repetition_penalty,
bool use_local_batch = false) bool use_local_batch = false)
{ {
size_t local_batch_size = use_local_batch ? batch_size / 3 : batch_size; size_t local_batch_size = use_local_batch ? batch_size / 3 : batch_size;
uint ite = use_local_batch ? 1 : 0; uint ite = use_local_batch ? 1 : 0;
for (unsigned long long seed = 0; seed < max_seed; ++seed) { for (unsigned long long seed = 0; seed < max_seed; ++seed) {
this->setup(seed); this->setup(seed);
size_t step = max_input_len; size_t step = max_input_len;
TensorMap* input_tensors = createInputTensors( TensorMap* input_tensors =
top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty); createInputTensors(top_ks, top_k_size, top_ps, top_p_size, temperature, repetition_penalty);
input_tensors->insert({"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}); input_tensors->insert({"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}});
input_tensors->insert({"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}); input_tensors->insert({"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}});
input_tensors->insert({"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}); input_tensors->insert({"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}});
...@@ -316,27 +323,57 @@ TYPED_TEST_SUITE(SamplingDecodeTest, FloatAndHalfTypes); ...@@ -316,27 +323,57 @@ TYPED_TEST_SUITE(SamplingDecodeTest, FloatAndHalfTypes);
TYPED_TEST(SamplingDecodeTest, TopK) TYPED_TEST(SamplingDecodeTest, TopK)
{ {
int top_k = 2; int top_k = 2;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
// 0 1 2 3 4 5 // 0 1 2 3 4 5
{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, // step 0 {0, 1},
{4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, {4, 5}, // step 1 {0, 1},
{2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3} // step 2 {0, 1},
{0, 1},
{0, 1},
{0, 1}, // step 0
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5},
{4, 5}, // step 1
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, BatchTopK) TYPED_TEST(SamplingDecodeTest, BatchTopK)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 1, 1, 2, 1, 1}; int* top_ks = new int[batch_size]{2, 1, 1, 2, 1, 1};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
// 0 1 2 3 4 5 // 0 1 2 3 4 5
{0, 1}, {0}, {0}, {0, 1}, {0}, {0}, // step 0 {0, 1},
{4, 5}, {4}, {4}, {4, 5}, {4}, {4}, // step 1 {0},
{2, 3}, {2}, {2}, {2, 3}, {2}, {2} // step 2 {0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2},
{2},
{2, 3},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
...@@ -344,52 +381,112 @@ TYPED_TEST(SamplingDecodeTest, BatchTopK) ...@@ -344,52 +381,112 @@ TYPED_TEST(SamplingDecodeTest, BatchTopK)
TYPED_TEST(SamplingDecodeTest, TopP) TYPED_TEST(SamplingDecodeTest, TopP)
{ {
float top_p = 0.3; float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, BatchTopP) TYPED_TEST(SamplingDecodeTest, BatchTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f}; float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
{0}, {0, 1}, {0, 1}, {0}, {0, 1}, {0, 1}, // step 0 {0},
{4}, {4, 5}, {4, 5}, {4}, {4, 5}, {4, 5}, // step 1 {0, 1},
{2}, {2, 3}, {2, 3}, {2}, {2, 3}, {2, 3} // step 2 {0, 1},
{0},
{0, 1},
{0, 1}, // step 0
{4},
{4, 5},
{4, 5},
{4},
{4, 5},
{4, 5}, // step 1
{2},
{2, 3},
{2, 3},
{2},
{2, 3},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, TopKTopP) { TYPED_TEST(SamplingDecodeTest, TopKTopP)
int top_k = 2; {
float top_p = 0.3; int top_k = 2;
std::vector<std::set<int>> expected_output_ids { float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, BatchTopKTopP) TYPED_TEST(SamplingDecodeTest, BatchTopKTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 1, 2, 2, 1}; int* top_ks = new int[batch_size]{2, 2, 1, 2, 2, 1};
float top_p = 0.3; float top_p = 0.3;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
...@@ -397,29 +494,59 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKTopP) ...@@ -397,29 +494,59 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKTopP)
TYPED_TEST(SamplingDecodeTest, TopKBatchTopP) TYPED_TEST(SamplingDecodeTest, TopKBatchTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int top_k = 2; int top_k = 2;
float* top_ps = new float[batch_size]{0.5, 0.3, 0.5, 0.5, 0.3, 0.5}; float* top_ps = new float[batch_size]{0.5, 0.3, 0.5, 0.5, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0 {0, 1},
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1 {0},
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2 {0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP) TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0}; int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5}; float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0}, {0, 1}, {0, 1}, {0}, {0, 1}, // step 0 {0, 1},
{4, 5}, {4}, {4, 5}, {4, 5}, {4}, {4, 5}, // step 1 {0},
{2, 3}, {2}, {2, 3}, {2, 3}, {2}, {2, 3} // step 2 {0, 1},
{0, 1},
{0},
{0, 1}, // step 0
{4, 5},
{4},
{4, 5},
{4, 5},
{4},
{4, 5}, // step 1
{2, 3},
{2},
{2, 3},
{2, 3},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
...@@ -428,162 +555,351 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP) ...@@ -428,162 +555,351 @@ TYPED_TEST(SamplingDecodeTest, BatchTopKBatchTopP)
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopK) TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopK)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int top_k = 0; int top_k = 0;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, nullptr, 0, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopP) TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
float top_p = 0; float top_p = 0;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKTopP) TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKTopP)
{ {
size_t batch_size = this->batch_size; size_t batch_size = this->batch_size;
int top_k = 0; int top_k = 0;
float top_p = 0; float top_p = 0;
std::vector<std::set<int>> expected_output_ids { std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, &top_p, 1, nullptr, nullptr);
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP) { TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroBatchTopKTopP)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{0, 0, 0, 0, 0, 0}; size_t batch_size = this->batch_size;
float top_p = 0; int* top_ks = new int[batch_size]{0, 0, 0, 0, 0, 0};
std::vector<std::set<int>> expected_output_ids { float top_p = 0;
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP) { TYPED_TEST(SamplingDecodeTest, InvalidArgsZeroTopKBatchTopP)
size_t batch_size = this->batch_size; {
int top_k = 0; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; int top_k = 0;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0}, {0}, {0}, // step 0 {0},
{4}, {4}, {4}, {4}, {4}, {4}, // step 1 {0},
{2}, {2}, {2}, {2}, {2}, {2} // step 2 {0},
{0},
{0},
{0}, // step 0
{4},
{4},
{4},
{4},
{4},
{4}, // step 1
{2},
{2},
{2},
{2},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKContainZero)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{2, 1, 0, 0, 2, 1}; size_t batch_size = this->batch_size;
std::vector<std::set<int>> expected_output_ids { int* top_ks = new int[batch_size]{2, 1, 0, 0, 2, 1};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0}, {0}, {0}, {0, 1}, {0}, // step 0 {0, 1},
{4, 5}, {4}, {4}, {4}, {4, 5}, {4}, // step 1 {0},
{2, 3}, {2}, {2}, {2}, {2, 3}, {2} // step 2 {0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2},
{2},
{2},
{2, 3},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, nullptr, 0, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopPContainZero)
size_t batch_size = this->batch_size; {
float* top_ps = new float[batch_size]{0.5f, 0.5f, 0.0f, 0.5f, 0.0f, 0.3f}; size_t batch_size = this->batch_size;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.5f, 0.5f, 0.0f, 0.5f, 0.0f, 0.3f};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0, 1}, {0}, {0, 1}, {0}, {0}, // step 0 {0, 1},
{4, 5}, {4, 5}, {4}, {4, 5}, {4}, {4}, // step 1 {0, 1},
{2, 3}, {2, 3}, {2}, {2, 3}, {2}, {2} // step 2 {0},
{0, 1},
{0},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4, 5},
{4},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2, 3},
{2},
{2} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKTopPContainZero)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{2, 2, 1, 0, 2, 0}; size_t batch_size = this->batch_size;
float top_p = 0.0; int* top_ks = new int[batch_size]{2, 2, 1, 0, 2, 0};
std::vector<std::set<int>> expected_output_ids { float top_p = 0.0;
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0, 1}, {0, 1}, {0}, {0}, {0, 1}, {0}, // step 0 {0, 1},
{4, 5}, {4, 5}, {4}, {4}, {4, 5}, {4}, // step 1 {0, 1},
{2, 3}, {2, 3}, {2}, {2}, {2, 3}, {2} // step 2 {0},
{0},
{0, 1},
{0}, // step 0
{4, 5},
{4, 5},
{4},
{4},
{4, 5},
{4}, // step 1
{2, 3},
{2, 3},
{2},
{2},
{2, 3},
{2} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, &top_p, 1, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsTopKBatchTopPContainZero)
size_t batch_size = this->batch_size; {
int top_k = 0; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5}; int top_k = 0;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0, 1}, {0}, {0}, {0, 1}, // step 0 {0},
{4}, {4}, {4, 5}, {4}, {4}, {4, 5}, // step 1 {0},
{2}, {2}, {2, 3}, {2}, {2}, {2, 3} // step 2 {0, 1},
{0},
{0},
{0, 1}, // step 0
{4},
{4},
{4, 5},
{4},
{4},
{4, 5}, // step 1
{2},
{2},
{2, 3},
{2},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, &top_k, 1, top_ps, batch_size, nullptr, nullptr);
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero) { TYPED_TEST(SamplingDecodeTest, InvalidArgsBatchTopKBatchTopPContainZero)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{0, 2, 1, 2, 2, 0}; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.9, 0.0, 0.3, 0.5}; int* top_ks = new int[batch_size]{0, 2, 1, 2, 2, 0};
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0, 0.3, 0.9, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0}, {0, 1}, {0}, {0, 1}, // step 0 {0},
{4}, {4}, {4}, {4, 5}, {4}, {4, 5}, // step 1 {0},
{2}, {2}, {2}, {2, 3}, {2}, {2, 3} // step 2 {0},
{0, 1},
{0},
{0, 1}, // step 0
{4},
{4},
{4},
{4, 5},
{4},
{4, 5}, // step 1
{2},
{2},
{2},
{2, 3},
{2},
{2, 3} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr); this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr);
delete[] top_ks; delete[] top_ks;
delete[] top_ps; delete[] top_ps;
} }
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP) { TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopP)
size_t batch_size = this->batch_size; {
float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f}; size_t batch_size = this->batch_size;
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.3f, 0.5f, 0.5f, 0.3f, 0.5f, 0.5f};
{0}, {0}, {0, 1}, {0}, {0}, {0}, // step 0 std::vector<std::set<int>> expected_output_ids{
{0}, {0}, {4, 5}, {4}, {0}, {0}, // step 1 {0},
{0}, {0}, {2, 3}, {2}, {0}, {0} // step 2 {0},
{0, 1},
{0},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2},
{0},
{0} // step 2
}; };
this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr, true); this->runTest(expected_output_ids, nullptr, 0, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ps; delete[] top_ps;
} }
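Why rows 0, 1, 4 and 5 stay {0} in the two LocalBatch tests: runTest passes local_batch_size = batch_size / 3 = 2 together with ite = 1, so the decode layer only works on the second chunk of rows, and the untouched rows keep their zero-initialized output ids. Assuming the usual chunk arithmetic (a sketch, not the layer's code):

    #include <cstdio>

    int main()
    {
        const size_t   batch_size       = 6;
        const size_t   local_batch_size = batch_size / 3;           // 2
        const unsigned ite              = 1;                        // second chunk
        const size_t   first            = ite * local_batch_size;   // 2
        const size_t   last             = first + local_batch_size; // 4
        std::printf("decoded rows: [%zu, %zu)\n", first, last);     // rows 2 and 3
    }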
TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP) { TYPED_TEST(SamplingDecodeTest, LocalBatchBatchTopKBatchTopP)
size_t batch_size = this->batch_size; {
int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0}; size_t batch_size = this->batch_size;
float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5}; int* top_ks = new int[batch_size]{2, 2, 0, 2, 2, 0};
std::vector<std::set<int>> expected_output_ids { float* top_ps = new float[batch_size]{0.0, 0.3, 0.5, 0.0, 0.3, 0.5};
std::vector<std::set<int>> expected_output_ids{
// batch // batch
{0}, {0}, {0, 1}, {0, 1}, {0}, {0}, // step 0 {0},
{0}, {0}, {4, 5}, {4, 5}, {0}, {0}, // step 1 {0},
{0}, {0}, {2, 3}, {2, 3}, {0}, {0} // step 2 {0, 1},
{0, 1},
{0},
{0}, // step 0
{0},
{0},
{4, 5},
{4, 5},
{0},
{0}, // step 1
{0},
{0},
{2, 3},
{2, 3},
{0},
{0} // step 2
}; };
this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr, true); this->runTest(expected_output_ids, top_ks, batch_size, top_ps, batch_size, nullptr, nullptr, true);
delete[] top_ks; delete[] top_ks;
...@@ -601,15 +917,10 @@ public: ...@@ -601,15 +917,10 @@ public:
check_cuda_error(cublasCreate(&cublas_handle)); check_cuda_error(cublasCreate(&cublas_handle));
check_cuda_error(cublasLtCreate(&cublaslt_handle)); check_cuda_error(cublasLtCreate(&cublaslt_handle));
check_cuda_error(cublasSetStream(cublas_handle, stream)); check_cuda_error(cublasSetStream(cublas_handle, stream));
cublas_algo_map = new cublasAlgoMap(""); cublas_algo_map = new cublasAlgoMap("");
cublas_wrapper_mutex = new std::mutex(); cublas_wrapper_mutex = new std::mutex();
cublas_wrapper = new cublasMMWrapper(cublas_handle, cublas_wrapper = new cublasMMWrapper(
cublaslt_handle, cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, allocator);
stream,
cublas_algo_map,
cublas_wrapper_mutex,
allocator);
} }
void TearDown() override void TearDown() override
{ {
...@@ -626,12 +937,11 @@ protected: ...@@ -626,12 +937,11 @@ protected:
using FtTestBase::allocator; using FtTestBase::allocator;
struct cudaDeviceProp prop; struct cudaDeviceProp prop;
cublasHandle_t cublas_handle; cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle; cublasLtHandle_t cublaslt_handle;
cublasAlgoMap* cublas_algo_map; cublasAlgoMap* cublas_algo_map;
std::mutex* cublas_wrapper_mutex; std::mutex* cublas_wrapper_mutex;
cublasMMWrapper* cublas_wrapper; cublasMMWrapper* cublas_wrapper;
DataType data_type = getTensorType<T>(); DataType data_type = getTensorType<T>();
...@@ -643,50 +953,50 @@ protected: ...@@ -643,50 +953,50 @@ protected:
size_t max_output_len; size_t max_output_len;
size_t max_seq_len; size_t max_seq_len;
uint top_k; uint top_k;
float top_p; float top_p;
float temperature; float temperature;
float repetition_penalty; float repetition_penalty;
int end_id; int end_id;
T* h_logits; T* h_logits;
T* h_probs; T* h_probs;
T* h_log_probs; T* h_log_probs;
float* h_cum_log_probs; float* h_cum_log_probs;
float* h_output_log_probs; float* h_output_log_probs;
int* h_output_ids; int* h_output_ids;
T* d_logits; T* d_logits;
int* d_input_lengths; int* d_input_lengths;
float* d_cum_log_probs; float* d_cum_log_probs;
float* d_output_log_probs; float* d_output_log_probs;
int* d_output_ids; int* d_output_ids;
int* d_end_ids; int* d_end_ids;
void setup(SamplingLayerTestParam param) void setup(SamplingLayerTestParam param)
{ {
batch_size = param.batch_size; batch_size = param.batch_size;
beam_width = param.beam_width; beam_width = param.beam_width;
batchxbeam = batch_size * param.beam_width; batchxbeam = batch_size * param.beam_width;
vocab_size = param.vocab_size; vocab_size = param.vocab_size;
max_input_len = 0; max_input_len = 0;
max_output_len = param.output_len; max_output_len = param.output_len;
max_seq_len = max_input_len + max_output_len; max_seq_len = max_input_len + max_output_len;
top_k = param.top_k; top_k = param.top_k;
top_p = param.top_p; top_p = param.top_p;
// Use default values that have no effect. // Use default values that have no effect.
temperature = 1.0f; temperature = 1.0f;
repetition_penalty = 1.0f; repetition_penalty = 1.0f;
end_id = 0; end_id = 0;
h_logits = new T[batchxbeam * vocab_size]; h_logits = new T[batchxbeam * vocab_size];
h_output_ids = new int[batchxbeam]; h_output_ids = new int[batchxbeam];
d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size)); d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batchxbeam * vocab_size));
d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam)); d_output_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batchxbeam));
d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size)); d_end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
// Initialize to zero. // Initialize to zero.
deviceFill(d_input_lengths, batchxbeam, 0, stream); deviceFill(d_input_lengths, batchxbeam, 0, stream);
...@@ -694,14 +1004,13 @@ protected: ...@@ -694,14 +1004,13 @@ protected:
deviceFill(d_end_ids, batch_size, end_id); deviceFill(d_end_ids, batch_size, end_id);
} }
void teardown() { void teardown()
{
delete[] h_logits; delete[] h_logits;
delete[] h_output_ids; delete[] h_output_ids;
} }
void runCurandTest(SamplingLayerTestParam param, void runCurandTest(SamplingLayerTestParam param, bool use_local_batch, bool use_single_random_seed)
bool use_local_batch,
bool use_single_random_seed)
{ {
setup(param); setup(param);
const DataType data_type = getTensorType<T>(); const DataType data_type = getTensorType<T>();
...@@ -709,7 +1018,7 @@ protected: ...@@ -709,7 +1018,7 @@ protected:
const size_t local_batch_size = use_local_batch ? 3 : batch_size; const size_t local_batch_size = use_local_batch ? 3 : batch_size;
assert(batch_size % local_batch_size == 0); assert(batch_size % local_batch_size == 0);
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size, DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size, vocab_size,
end_id, end_id,
stream, stream,
...@@ -719,9 +1028,9 @@ protected: ...@@ -719,9 +1028,9 @@ protected:
&prop); // cuda_device_prop &prop); // cuda_device_prop
// Prepare decoding arguments // Prepare decoding arguments
const size_t random_seed_size = use_single_random_seed ? 1 : batch_size; const size_t random_seed_size = use_single_random_seed ? 1 : batch_size;
const size_t period_size = 3; const size_t period_size = 3;
unsigned long long* random_seed = new unsigned long long[random_seed_size]; unsigned long long* random_seed = new unsigned long long[random_seed_size];
for (size_t i = 0; i < random_seed_size; ++i) { for (size_t i = 0; i < random_seed_size; ++i) {
random_seed[i] = i / period_size; random_seed[i] = i / period_size;
} }
...@@ -739,29 +1048,27 @@ protected: ...@@ -739,29 +1048,27 @@ protected:
cudaH2Dcpy(d_logits, h_logits, batchxbeam * vocab_size); cudaH2Dcpy(d_logits, h_logits, batchxbeam * vocab_size);
for (uint ite = 0; ite < iteration_num; ++ite) { for (uint ite = 0; ite < iteration_num; ++ite) {
TensorMap dynamic_decode_input_tensors({ TensorMap dynamic_decode_input_tensors(
{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}}, {{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size}, d_logits}},
{"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}}, {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}}, {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}}, {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
{"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}}, {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, d_input_lengths}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}, {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}},
{"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_end_ids}}, {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, d_end_ids}},
{"random_seed", {MEMORY_CPU, TYPE_UINT64, {random_seed_size}, random_seed}}, {"random_seed", {MEMORY_CPU, TYPE_UINT64, {random_seed_size}, random_seed}},
{"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}}, {"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
{"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}} {"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}}});
});
// common outputs // common outputs
TensorMap dynamic_decode_output_tensors({ TensorMap dynamic_decode_output_tensors(
{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}}, {{"output_ids",
{"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}}, Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids}},
{"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}} {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
}); {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});
dynamic_decode_layer->forward(&dynamic_decode_output_tensors, dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
&dynamic_decode_input_tensors);
sync_check_cuda_error(); sync_check_cuda_error();
// check results. // check results.
...@@ -774,7 +1081,11 @@ protected: ...@@ -774,7 +1081,11 @@ protected:
for (size_t j = 1; j < period_size; ++j) { for (size_t j = 1; j < period_size; ++j) {
EXPECT_TRUE(h_output_ids[i] == h_output_ids[i + j]) EXPECT_TRUE(h_output_ids[i] == h_output_ids[i + j])
<< fmtstr("Fail at step %u val[%d]=%d <> val[%d]=%d", << fmtstr("Fail at step %u val[%d]=%d <> val[%d]=%d",
step, i, h_output_ids[i], i + j, h_output_ids[i + j]); step,
i,
h_output_ids[i],
i + j,
h_output_ids[i + j]);
} }
} }
} }
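With period_size = 3, the seed array prepared earlier in runCurandTest is {0, 0, 0, 1, 1, 1, ...}; rows that share a seed get identical curand states, so their sampled ids must agree at every step, which is exactly what the nested loops assert. For a batch of 6 (illustrative):

    unsigned long long random_seed[6] = {0, 0, 0, 1, 1, 1};  // random_seed[i] = i / period_size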
...@@ -783,11 +1094,12 @@ protected: ...@@ -783,11 +1094,12 @@ protected:
teardown(); teardown();
} }
void runCumLogProbTest(SamplingLayerTestParam param) { void runCumLogProbTest(SamplingLayerTestParam param)
{
setup(param); setup(param);
unsigned long long seed = 43; unsigned long long seed = 43;
const DataType data_type = getTensorType<T>(); const DataType data_type = getTensorType<T>();
DynamicDecodeLayer<T> *dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size, DynamicDecodeLayer<T>* dynamic_decode_layer = new DynamicDecodeLayer<T>(vocab_size,
vocab_size, vocab_size,
end_id, end_id,
stream, stream,
...@@ -798,10 +1110,10 @@ protected: ...@@ -798,10 +1110,10 @@ protected:
// Logit values on the host of shape ((batch_size x beam) x vocab_size) where beam = 1. // Logit values on the host of shape ((batch_size x beam) x vocab_size) where beam = 1.
// T* h_logits = new T[batch_size * beam_width * vocab_size]; // T* h_logits = new T[batch_size * beam_width * vocab_size];
T* h_probs = new T[batch_size * beam_width * vocab_size]; T* h_probs = new T[batch_size * beam_width * vocab_size];
T* h_log_probs = new T[batch_size * beam_width * vocab_size]; T* h_log_probs = new T[batch_size * beam_width * vocab_size];
float* h_cum_log_probs = new float[batch_size * beam_width]; float* h_cum_log_probs = new float[batch_size * beam_width];
float* h_output_log_probs = new float[max_output_len * batch_size * beam_width]; float* h_output_log_probs = new float[max_output_len * batch_size * beam_width];
float* expected_cum_log_probs = new float[batch_size * beam_width]; float* expected_cum_log_probs = new float[batch_size * beam_width];
initRandom(h_logits, batch_size * beam_width * vocab_size, -3.0f, 3.0f); initRandom(h_logits, batch_size * beam_width * vocab_size, -3.0f, 3.0f);
computeProb(h_probs, h_logits, batch_size * beam_width, vocab_size); computeProb(h_probs, h_logits, batch_size * beam_width, vocab_size);
...@@ -810,10 +1122,11 @@ protected:
        int* tiled_input_lengths_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * beam_width));
        float* cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size * beam_width));
        float* output_log_probs =
            reinterpret_cast<float*>(allocator->malloc(sizeof(float) * max_output_len * batch_size * beam_width));
        int* output_ids =
            reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_seq_len * batch_size * beam_width));
        int* h_output_ids = new int[batch_size * beam_width];
        int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
...@@ -824,65 +1137,64 @@ protected:
        cudaMemset(output_log_probs, 0, sizeof(float) * max_output_len * batch_size * beam_width);
        cudaMemset(output_ids, 0, sizeof(int) * max_seq_len * batch_size * beam_width);
        TensorMap input_tensors({{"random_seed", {MEMORY_CPU, TYPE_INT32, {1}, &seed}},
                                 {"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
                                 {"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
                                 {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
                                 {"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});
        dynamic_decode_layer->setup(batch_size, beam_width, &input_tensors);

        for (size_t step = max_input_len; step < max_output_len; ++step) {
            uint ite = 0;
            // Reset by the test value since the sampling layer internally updates the logit buffer
            // (making it log-prob).
            cudaH2Dcpy(d_logits, h_logits, batch_size * beam_width * vocab_size);
            TensorMap dynamic_decode_input_tensors(
                {{"logits", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size, beam_width, vocab_size}, d_logits}},
                 {"embedding_bias", Tensor{MEMORY_GPU, data_type, {vocab_size}, nullptr}},
                 {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
                 {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_len}},
                 {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size, beam_width}, tiled_input_lengths_buf}},
                 {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
                 {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &batch_size}},
                 {"end_id", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids}},
                 {"random_seed", {MEMORY_CPU, TYPE_UINT64, {1}, &seed}},
                 {"runtime_top_k", {MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
                 {"runtime_top_p", {MEMORY_CPU, TYPE_FP32, {1}, &top_p}},
                 {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &temperature}},
                 {"repetition_penalty", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty}}});

            // common outputs
            TensorMap dynamic_decode_output_tensors(
                {{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, output_ids}},
                 {"finished", Tensor{MEMORY_GPU, TYPE_BOOL, {batch_size * beam_width}, nullptr}},
                 {"cum_log_probs", Tensor{MEMORY_GPU, TYPE_FP32, {batch_size * beam_width}, cum_log_probs}},
                 {"output_log_probs",
                  Tensor{MEMORY_GPU, TYPE_FP32, {max_seq_len, batch_size, beam_width}, output_log_probs}},
                 {"sequence_length", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, nullptr}}});

            dynamic_decode_layer->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);

            TM_LOG_DEBUG("Step %2d generated ids", step);
            cudaD2Hcpy(
                h_output_ids,
                dynamic_decode_output_tensors.at("output_ids").getPtrWithOffset<int>(step * (batch_size * beam_width)),
                batch_size * beam_width);
            cudaD2Hcpy(h_cum_log_probs, cum_log_probs, batch_size * beam_width);
            cudaD2Hcpy(h_output_log_probs, output_log_probs, max_output_len * batch_size * beam_width);

            for (size_t i = 0; i < batch_size * beam_width; ++i) {
                int idx = i * vocab_size + h_output_ids[i];
                expected_cum_log_probs[i] += (float)h_log_probs[idx];
                TM_LOG_DEBUG("| step %2d batch %2d idx %7d id %6d | log-prob %9.4f (expt: %9.4f) "
                             "| cum-log-prob %9.4f (expt: %9.4f) | prob %9.4e",
                             (int)step,
                             (int)i,
                             (int)idx,
                             (int)h_output_ids[i],
                             h_output_log_probs[step * batch_size * beam_width + i],
                             (float)h_log_probs[idx],
                             h_cum_log_probs[i],
                             expected_cum_log_probs[i],
                             (float)h_probs[idx]);
            }
            TM_LOG_DEBUG("");
        }
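        // The elided assertions below compare the layer's cum_log_probs output with
        // expected_cum_log_probs, the host-side running sum of log-softmax(h_logits)
        // at each sampled token accumulated in the loop above.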
...@@ -898,7 +1210,6 @@ protected:
        delete dynamic_decode_layer;
    }
};

TYPED_TEST_SUITE(SamplingDecodeTest2, FloatAndHalfTypes);
...
#include <iostream>
#include <unordered_map>
#include <vector>

#include <gtest/gtest.h>
...@@ -10,16 +10,17 @@ using namespace turbomind;

namespace {

#define EXPECT_EQUAL_TENSORS(t1, t2)                                                                                   \
    do {                                                                                                               \
        EXPECT_TRUE(t1.where == t2.where);                                                                             \
        EXPECT_TRUE(t1.type == t2.type);                                                                               \
        EXPECT_TRUE(t1.shape == t2.shape);                                                                             \
        EXPECT_TRUE(t1.data == t2.data);                                                                               \
    } while (false)
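// Hypothetical usage (illustration, not one of the tests below): two Tensor
// views over the same buffer with identical placement, type and shape compare
// equal, since all four identifying fields match.
//
//   float  buf[4] = {0.f, 1.f, 2.f, 3.f};
//   Tensor a{MEMORY_CPU, TYPE_FP32, {4}, buf};
//   Tensor b{MEMORY_CPU, TYPE_FP32, {4}, buf};
//   EXPECT_EQUAL_TENSORS(a, b);  // passes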
TEST(TensorMapTest, HasKeyCorrectness)
{
    bool* v1 = new bool(true);
    float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1};
    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2};
...@@ -33,8 +34,9 @@ TEST(TensorMapTest, HasKeyCorrectness) {
    delete[] v2;
}

TEST(TensorMapTest, InsertCorrectness)
{
    int* v1 = new int[4]{1, 10, 20, 30};
    float* v2 = new float[2]{1.0f, 2.0f};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2);
...@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) {
    EXPECT_FALSE(map.isExist("t2"));
}

TEST(TensorMapTest, InsertDoesNotAllowNoneTensor)
{
    TensorMap map;
    EXPECT_TRUE(map.size() == 0);
    // forbid a none tensor.
...@@ -57,10 +60,11 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
    EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error);
}

TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey)
{
    int* v1 = new int[4]{1, 10, 20, 30};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);

    TensorMap map({{"t1", t1}});
    EXPECT_TRUE(map.size() == 1);
    // forbid a duplicated key.
...@@ -68,8 +72,9 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
    delete[] v1;
}

TEST(TensorMapTest, GetValCorrectness)
{
    int* v1 = new int[4]{1, 10, 20, 30};
    Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);

    TensorMap map({{"t1", t1}});
...@@ -93,13 +98,14 @@ TEST(TensorMapTest, GetValCorrectness) {
    delete[] v1;
}

TEST(TensorMapTest, GetTensorCorrectness)
{
    bool* t1_val = new bool(true);
    float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};

    int* default_val = new int[4]{0, 1, 2, 3};
    Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};

    TensorMap map({{"t1", t1}, {"t2", t2}});
...@@ -114,13 +120,14 @@ TEST(TensorMapTest, GetTensorCorrectness) {
    delete[] t1_val;
}

TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap)
{
    bool* t1_val = new bool(true);
    float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
    Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
    Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};

    int* default_val = new int[4]{0, 1, 2, 3};
    Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};

    const TensorMap map({{"t1", t1}, {"t2", t2}});
...@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
    delete[] t1_val;
}

TEST(TensorTest, EmptyTensorMinMaxRaiseError)
{
    Tensor t1;
    EXPECT_THROW(t1.min<int>(), std::runtime_error);
    EXPECT_THROW(t1.max<int>(), std::runtime_error);
...@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
    EXPECT_THROW(t2.max<int>(), std::runtime_error);
}
using TensorTypes = testing::Types<int8_t, int, float>;

template<typename T>
class TensorFuncTest: public testing::Test {};

TYPED_TEST_SUITE(TensorFuncTest, TensorTypes);

TYPED_TEST(TensorFuncTest, MaxCorrectness)
{
    using T = TypeParam;

    size_t size = 4;
    T* v1 = new T[size]{T(1), T(2), T(3), T(4)};
    T* v2 = new T[size]{T(4), T(3), T(2), T(1)};
    T* v3 = new T[size]{T(1), T(2), T(4), T(3)};
    Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1);
    Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2);
...@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) {
    delete[] v3;
}
TYPED_TEST(TensorFuncTest, MinCorrectness)
{
    using T = TypeParam;

    size_t size = 4;
...@@ -197,42 +206,45 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) {
    delete[] v3;
}

TYPED_TEST(TensorFuncTest, AnyCorrectness)
{
    using T = TypeParam;

    T* v = new T[4]{T(1), T(2), T(3), T(4)};
    Tensor t = Tensor{MEMORY_CPU, getTensorType<T>(), {4}, v};

    EXPECT_TRUE(t.any<T>(T(1)));
    EXPECT_FALSE(t.any<T>(T(5)));

    delete[] v;
}
TYPED_TEST(TensorFuncTest, AllCorrectness)
{
    using T = TypeParam;

    constexpr size_t size = 4;
    T* v1 = new T[size]{T(1), T(1), T(1), T(1)};
    T* v2 = new T[size]{T(1), T(1), T(1), T(2)};
    Tensor t1 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v1};
    Tensor t2 = Tensor{MEMORY_CPU, getTensorType<T>(), {size}, v2};

    EXPECT_TRUE(t1.all<T>(T(1)));
    EXPECT_FALSE(t2.all<T>(T(2)));

    delete[] v1;
    delete[] v2;
}

TYPED_TEST(TensorFuncTest, SliceCorrectness)
{
    using T = TypeParam;

    constexpr int size = 12;
    T* v = new T[size];
    for (int i = 0; i < size; ++i) {
        v[i] = i;
    }

    DataType dtype = getTensorType<T>();
    Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
    Tensor t2 = t1.slice({2, 4}, 4);
    EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4]));

    // An overflowed tensor throws an exception.
...@@ -241,4 +253,4 @@ TYPED_TEST(TensorFuncTest, SliceCorrectness) {
    delete[] v;
}

}  // end of namespace
...@@ -16,15 +16,15 @@

#pragma once

#include <algorithm>  // min, max
#include <assert.h>   // assert
#include <float.h>    // FLT_MAX
#include <iostream>   // snprintf
#include <limits>     // numeric_limits
#include <math.h>     // expf, log
#include <stdlib.h>   // rand
#include <string>     // string
#include <vector>     // vector

#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
...@@ -36,32 +36,37 @@

using namespace turbomind;

class TestFailureError: public std::exception {
private:
    std::string msg_;

public:
    explicit TestFailureError() = default;
    explicit TestFailureError(std::string name, std::string msg = "")
    {
        msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
    }
    const char* what() const throw()
    {
        return msg_.c_str();
    }
};
#define EXPECT_TRUE(cond)                                                                                              \
    do {                                                                                                               \
        if (!(cond)) {                                                                                                 \
            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__);                          \
            throw TestFailureError(__func__);                                                                          \
        }                                                                                                              \
    } while (false)

#define EXPECT_FALSE(cond)                                                                                             \
    do {                                                                                                               \
        if (cond) {                                                                                                    \
            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__);                          \
            throw TestFailureError(__func__);                                                                          \
        }                                                                                                              \
    } while (false)
bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
{
...@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
}
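// The elided body presumably applies the usual mixed absolute/relative test;
// a minimal sketch under that assumption:
//
//   bool close = fabsf(a - b) <= (atol + rtol * fabsf(b));
//
// atol governs comparisons near zero, rtol governs large magnitudes.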
template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size, float atol, float rtol)
{
    size_t failures = 0;
    float relative_gap = 0.0f;

    for (size_t i = 0; i < size; ++i) {
        // The values for the output and the reference.
...@@ -109,18 +116,21 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
    // Allow not matched up to 1% elements.
    size_t tol_failures = (size_t)(0.01 * size);
    TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
                failures <= tol_failures ? "....OK" : "FAILED",
                name.c_str(),
                100. * failures / size,
                atol,
                rtol,
                100. * relative_gap);
    return failures <= tol_failures;
}
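// Hypothetical call site (illustration only; both pointers are host buffers
// for this overload):
//
//   bool ok = checkResult("sampled_logits", h_out, h_ref, n, 1e-4f, 1e-2f);
//
// The check still passes when at most 1% of the n elements violate the
// atol/rtol bound, so occasional half-precision outliers do not fail an
// otherwise healthy run.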
template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size, bool device_out = true, bool device_ref = false)
{
    bool is_fp32 = sizeof(T) == 4;
    float atol = is_fp32 ? 1e-4f : 1e-3f;
    float rtol = is_fp32 ? 1e-2f : 1e-1f;

    T* h_out = nullptr;
    if (device_out) {
...@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
        ref = h_ref;
    }
    bool is_ok = checkResult(name, out, ref, size, atol, rtol);
    if (h_out != nullptr) {
        delete[] h_out;
    }
    if (h_ref != nullptr) {
...@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
}
template<typename T>
void initRandom(T* ptr, size_t size, float minval, float maxval)
{
    for (size_t i = 0; i < size; ++i) {
        float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        val *= (maxval - minval);
...@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) {
    }
}

void initRandomInt(int* ptr, size_t size, int minval, int maxval)
{
    assert(minval < maxval);
    int mod = maxval - minval;
    for (size_t i = 0; i < size; ++i) {
...@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
}
template<typename T>
void tile(T* x, int m, int n)
{
    for (int i = 1; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            x[i * n + j] = x[j];
...@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) {
}

template<typename T>
void tile(T* dst, T* src, int m, int n)
{
    for (int i = 1; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            dst[i * n + j] = src[j];
...@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) {
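// Illustrative use, mirroring how the sampling tests tile per-batch values
// across beams (buffer names hypothetical). Note that both variants write only
// rows 1..m-1: the in-place form keeps row 0 as the source, and the dst/src
// form appears to leave row 0 of dst to the caller.
//
//   int lengths[batch_size * beam_width];
//   std::copy(h_input_lengths, h_input_lengths + batch_size, lengths);
//   tile(lengths, beam_width, batch_size);  // replicate row 0 across beams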
#define HALF_FLT_MAX 65504.0f

template<typename T>
bool isHalf()
{
    return std::is_same<T, half>::value;
}

template<typename T>
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
    printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr);
}