Unverified commit 4c9959f6, authored by Chen Xin and committed by GitHub
Browse files

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
...@@ -24,6 +24,12 @@ ...@@ -24,6 +24,12 @@
namespace turbomind { namespace turbomind {
// cub.cuh brings in windows.h, which defines an ERROR macro; undefine it below.
// This header should therefore be included after cub.cuh.
#ifdef ERROR
#undef ERROR
#endif
class Logger { class Logger {
public: public:
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_type_utils.cuh" #include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
...@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve ...@@ -356,8 +357,8 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
} }
// get slices // get slices
ConcateSlice slice0{.slices = {{0, dim0}}}; ConcateSlice slice0{{{0, dim0}}};
ConcateSlice slice1{.slices = {{0, dim1}}}; ConcateSlice slice1{{{0, dim1}}};
if (slices.size() > 0 && slices[0].slices.size() > 0) { if (slices.size() > 0 && slices[0].slices.size() > 0) {
slice0 = slices[0]; slice0 = slices[0];
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
*/ */
#include "src/turbomind/utils/nccl_utils.h" #include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/macro.h"
#include <atomic> #include <atomic>
namespace turbomind { namespace turbomind {
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "nvtx_utils.h" #include "nvtx_utils.h"
#ifdef USE_NVTX #ifdef USE_NVTX
#include "nvToolsExt.h" #include "nvtx3/nvToolsExt.h"
#endif #endif
namespace ft_nvtx { namespace ft_nvtx {
......
...@@ -49,12 +49,12 @@ Tensor fused_gemm_dq_helper( ...@@ -49,12 +49,12 @@ Tensor fused_gemm_dq_helper(
const T* scales_ptr = get_ptr<const T>(scales); const T* scales_ptr = get_ptr<const T>(scales);
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner; turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k); const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
T* output_tensor_ptr = get_ptr<T>(output_tensor); T* output_tensor_ptr = get_ptr<T>(output_tensor);
char* ws_ptr = get_ptr<char>(ws_tensor); char* ws_ptr = get_ptr<char>(ws_tensor);
cudaEvent_t start, stop; cudaEvent_t start, stop;
...@@ -258,12 +258,12 @@ Tensor fused_gemm_dq_bias_act_helper( ...@@ -258,12 +258,12 @@ Tensor fused_gemm_dq_bias_act_helper(
const T* bias_ptr = get_ptr<const T>(bias); const T* bias_ptr = get_ptr<const T>(bias);
turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner; turbomind::CutlassFpAIntBGemmRunner<T, WeightType> fused_gemm_dq_runner;
const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k); const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k);
auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false));
auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
T* output_tensor_ptr = get_ptr<T>(output_tensor); T* output_tensor_ptr = get_ptr<T>(output_tensor);
char* ws_ptr = get_ptr<char>(ws_tensor); char* ws_ptr = get_ptr<char>(ws_tensor);
fused_gemm_dq_runner.gemm_bias_act(input_act_ptr, fused_gemm_dq_runner.gemm_bias_act(input_act_ptr,
......
...@@ -14,11 +14,11 @@ ...@@ -14,11 +14,11 @@
* limitations under the License. * limitations under the License.
*/ */
#include <chrono>
#include <cstdlib>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <cstdlib>
#include <chrono>
#include "torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/Stream.h"
#include <torch/custom_class.h> #include <torch/custom_class.h>
...@@ -37,18 +37,17 @@ using torch_ext::get_ptr; ...@@ -37,18 +37,17 @@ using torch_ext::get_ptr;
namespace ft = turbomind; namespace ft = turbomind;
template<typename T> template<typename T>
void int8_gemm_test( void int8_gemm_test(const int m,
const int m, const int n,
const int n, const int k,
const int k, const at::ScalarType output_data_type,
const at::ScalarType output_data_type, const QuantMode quant_mode,
const QuantMode quant_mode, const int iters)
const int iters)
{ {
const bool per_token_quant = quant_mode == QuantMode::PerTokenChannelQuant const bool per_token_quant =
|| quant_mode == QuantMode::PerTokenQuant; quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant;
const bool per_channel_quant = quant_mode == QuantMode::PerTokenChannelQuant const bool per_channel_quant =
|| quant_mode == QuantMode::PerChannelQuant; quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant;
const int row_scale_size = per_token_quant ? m : 1; const int row_scale_size = per_token_quant ? m : 1;
const int col_scale_size = per_channel_quant ? n : 1; const int col_scale_size = per_channel_quant ? n : 1;
...@@ -76,16 +75,16 @@ void int8_gemm_test( ...@@ -76,16 +75,16 @@ void int8_gemm_test(
ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr<int32_t>(w)}.saveNpy("w.npy"); ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr<int32_t>(w)}.saveNpy("w.npy");
ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y)}.saveNpy("y.npy"); ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y)}.saveNpy("y.npy");
auto x_gpu = x.to(at_int8).to(torch::kCUDA); auto x_gpu = x.to(at_int8).to(torch::kCUDA);
auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous(); auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous();
auto w_gpu = w.to(at_int8).to(torch::kCUDA); auto w_gpu = w.to(at_int8).to(torch::kCUDA);
auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false)); auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false));
auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false)); auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false));
auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) * auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32)); * torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32));
auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) * auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100)
torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32)); * torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32));
auto alpha_row_torch = alpha_row_cultass.expand({m, 1}); auto alpha_row_torch = alpha_row_cultass.expand({m, 1});
auto alpha_col_torch = alpha_col_cutlass.expand({1, n}); auto alpha_col_torch = alpha_col_cutlass.expand({1, n});
...@@ -101,40 +100,41 @@ void int8_gemm_test( ...@@ -101,40 +100,41 @@ void int8_gemm_test(
auto stream = at::cuda::getCurrentCUDAStream().stream(); auto stream = at::cuda::getCurrentCUDAStream().stream();
// warm_up // warm_up
cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu), cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
get_ptr<int8_t>(w_T_gpu), get_ptr<int8_t>(w_T_gpu),
quant_mode, quant_mode,
get_ptr<float>(alpha_col_gpu), get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu), get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu), get_ptr<T>(y_gpu),
m, m,
n, n,
k, k,
nullptr, nullptr,
0, 0,
stream); stream);
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)m, (size_t)k}, get_ptr<int8_t>(x_gpu)}.saveNpy("x_gpu.npy"); ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)m, (size_t)k}, get_ptr<int8_t>(x_gpu)}.saveNpy("x_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy"); ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr<int8_t>(w_T_gpu)}.saveNpy("w_T_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy"); ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr<int8_t>(w_gpu)}.saveNpy("w_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy"); ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr<T>(y_gpu)}.saveNpy("y_gpu.npy");
ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy("y_gpu_int32.npy"); ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr<int32_t>(y_gpu_int32)}.saveNpy(
"y_gpu_int32.npy");
ft::check_cuda_error(cudaStreamSynchronize(stream)); ft::check_cuda_error(cudaStreamSynchronize(stream));
auto start = high_resolution_clock::now(); auto start = high_resolution_clock::now();
for (int i = 0; i < iters; ++i) { for (int i = 0; i < iters; ++i) {
cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu), cutlass_runner_half.gemm(get_ptr<int8_t>(x_gpu),
get_ptr<int8_t>(w_T_gpu), get_ptr<int8_t>(w_T_gpu),
quant_mode, quant_mode,
get_ptr<float>(alpha_col_gpu), get_ptr<float>(alpha_col_gpu),
get_ptr<float>(alpha_row_gpu), get_ptr<float>(alpha_row_gpu),
get_ptr<T>(y_gpu), get_ptr<T>(y_gpu),
m, m,
n, n,
k, k,
nullptr, nullptr,
0, 0,
stream); stream);
} }
ft::check_cuda_error(cudaStreamSynchronize(stream)); ft::check_cuda_error(cudaStreamSynchronize(stream));
...@@ -142,27 +142,30 @@ void int8_gemm_test( ...@@ -142,27 +142,30 @@ void int8_gemm_test(
auto duration = duration_cast<microseconds>(end - start); auto duration = duration_cast<microseconds>(end - start);
if (torch::allclose((y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) { if (torch::allclose(
(y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) {
TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms"); TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
} else { }
else {
TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms"); TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms");
// std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl; // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) *
// alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl;
} }
} }
int main(int argc, char **argv) int main(int argc, char** argv)
{ {
if (argc != 7) { if (argc != 7) {
TM_LOG_ERROR("arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters."); TM_LOG_ERROR(
"arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters.");
return 0; return 0;
} }
const int m = atoi(argv[1]); const int m = atoi(argv[1]);
const int n = atoi(argv[2]); const int n = atoi(argv[2]);
const int k = atoi(argv[3]); const int k = atoi(argv[3]);
const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? at::ScalarType::Half : at::ScalarType::BFloat16;
at::ScalarType::Half : at::ScalarType::BFloat16; const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
if (quant_mode == QuantMode::PerChannelQuant) { if (quant_mode == QuantMode::PerChannelQuant) {
printf("per channel quant \n"); printf("per channel quant \n");
} }
...@@ -170,7 +173,8 @@ int main(int argc, char **argv) ...@@ -170,7 +173,8 @@ int main(int argc, char **argv)
if (output_data_type == at::ScalarType::Half) { if (output_data_type == at::ScalarType::Half) {
int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters); int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
} else { }
else {
#if ENABLE_BF16 #if ENABLE_BF16
int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters); int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
#endif #endif
......
...@@ -20,7 +20,12 @@ FetchContent_Declare( ...@@ -20,7 +20,12 @@ FetchContent_Declare(
GIT_REPOSITORY https://github.com/google/googletest.git GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG release-1.12.1 GIT_TAG release-1.12.1
) )
add_definitions(-DTORCH_CUDA=1)
find_package(CUDAToolkit REQUIRED)
if (NOT MSVC)
add_definitions(-DTORCH_CUDA=1)
endif()
# For Windows: Prevent overriding the parent project's compiler/linker settings # For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
...@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14) ...@@ -41,23 +46,23 @@ target_compile_features(unittest PRIVATE cxx_std_14)
# Sorted by alphabetical order of test name. # Sorted by alphabetical order of test name.
target_link_libraries( # Libs for test_attention_kernels target_link_libraries( # Libs for test_attention_kernels
unittest PUBLIC unittest PUBLIC
-lcudart -lcurand CUDA::cudart CUDA::curand
gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger) gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
target_link_libraries( # Libs for test_logprob_kernels target_link_libraries( # Libs for test_logprob_kernels
unittest PUBLIC unittest PUBLIC
-lcudart CUDA::cudart
logprob_kernels memory_utils cuda_utils logger) logprob_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_penalty_kernels target_link_libraries( # Libs for test_penalty_kernels
unittest PUBLIC unittest PUBLIC
-lcublas -lcublasLt -lcudart CUDA::cublas CUDA::cublasLt CUDA::cudart
sampling_penalty_kernels memory_utils cuda_utils logger) sampling_penalty_kernels memory_utils cuda_utils logger)
target_link_libraries( # Libs for test_sampling_kernel target_link_libraries( # Libs for test_sampling_kernel
unittest PUBLIC unittest PUBLIC
-lcudart CUDA::cudart
sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger) sampling_topk_kernels sampling_topp_kernels memory_utils tensor cuda_utils logger)
target_link_libraries( # Libs for test_sampling_layer target_link_libraries( # Libs for test_sampling_layer
unittest PUBLIC unittest PUBLIC
-lcublas -lcublasLt -lcudart CUDA::cublas CUDA::cublasLt CUDA::cudart
cublasMMWrapper memory_utils cublasMMWrapper memory_utils
DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger) DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer tensor cuda_utils logger)
target_link_libraries( # Libs for test_tensor target_link_libraries( # Libs for test_tensor
...@@ -65,7 +70,7 @@ target_link_libraries( # Libs for test_tensor ...@@ -65,7 +70,7 @@ target_link_libraries( # Libs for test_tensor
remove_definitions(-DTORCH_CUDA=1) remove_definitions(-DTORCH_CUDA=1)
add_executable(test_gemm test_gemm.cu) add_executable(test_gemm test_gemm.cu)
target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper tensor cuda_utils logger) target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
add_executable(test_gpt_kernels test_gpt_kernels.cu) add_executable(test_gpt_kernels test_gpt_kernels.cu)
target_link_libraries(test_gpt_kernels PUBLIC target_link_libraries(test_gpt_kernels PUBLIC
...@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC ...@@ -73,6 +78,6 @@ target_link_libraries(test_gpt_kernels PUBLIC
add_executable(test_context_attention_layer test_context_attention_layer.cu) add_executable(test_context_attention_layer test_context_attention_layer.cu)
target_link_libraries(test_context_attention_layer PUBLIC target_link_libraries(test_context_attention_layer PUBLIC
Llama -lcublas -lcublasLt -lcudart Llama CUDA::cublas CUDA::cublasLt CUDA::cudart
unfused_attention_kernels unfused_attention_kernels
memory_utils tensor cublasMMWrapper cuda_utils logger) memory_utils tensor cublasMMWrapper cuda_utils logger)
...@@ -14,13 +14,12 @@ ...@@ -14,13 +14,12 @@
* limitations under the License. * limitations under the License.
*/ */
#include "gtest_utils.h"
#include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h" #include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/nccl_utils.h" #include "src/turbomind/utils/nccl_utils.h"
#include "gtest_utils.h"
#include <curand.h> #include <curand.h>
#include <sstream> #include <sstream>
......
...@@ -336,35 +336,26 @@ int main(int argc, const char* argv[]) ...@@ -336,35 +336,26 @@ int main(int argc, const char* argv[])
// compute actual // compute actual
using AttentionOp = FlashAttentionOp<scalar_t>; using AttentionOp = FlashAttentionOp<scalar_t>;
using Layout = typename AttentionOp::AttentionLayout; using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = num_heads * seq_len * size_per_head, Layout layout_q{num_heads * seq_len * size_per_head, size_per_head, seq_len * size_per_head};
.stride_seq = size_per_head, Layout layout_k{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
.stride_head = seq_len * size_per_head}; Layout layout_v{num_heads * key_len * size_per_head, size_per_head, key_len * size_per_head};
Layout layout_k{.stride_batch = num_heads * key_len * size_per_head, Layout layout_o{num_heads * seq_len * size_per_head, num_heads * size_per_head, size_per_head, true};
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_v{.stride_batch = num_heads * key_len * size_per_head,
.stride_seq = size_per_head,
.stride_head = key_len * size_per_head};
Layout layout_o{.stride_batch = num_heads * seq_len * size_per_head,
.stride_seq = num_heads * size_per_head,
.stride_head = size_per_head,
.use_seqlens = true};
AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head); AttentionOp flash_attention(batch_size, num_heads, key_len, seq_len, size_per_head);
float* accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true); float* accum_buf_ptr = (float*)allocator.malloc(flash_attention.get_workspace_size(), true);
typename AttentionOp::Params attn_params{.attn_out = actual_out_ptr, typename AttentionOp::Params attn_params{actual_out_ptr,
.query = query_ptr, query_ptr,
.key = key_ptr, key_ptr,
.val = val_ptr, val_ptr,
.mask = mask_ptr, mask_ptr,
.out_accum = accum_buf_ptr, accum_buf_ptr,
.cu_seqlens_q = cu_seqlens_ptr, cu_seqlens_ptr,
.cu_seqlens_k = nullptr, nullptr,
.group_size = 1, 1,
.layout_q = layout_q, layout_q,
.layout_k = layout_k, layout_k,
.layout_v = layout_v, layout_v,
.layout_o = layout_o}; layout_o};
flash_attention(attn_params, stream); flash_attention(attn_params, stream);
sync_check_cuda_error(); sync_check_cuda_error();
......
This diff is collapsed.
...@@ -5,10 +5,10 @@ ...@@ -5,10 +5,10 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
...@@ -39,13 +39,14 @@ protected: ...@@ -39,13 +39,14 @@ protected:
void testTransposition(); void testTransposition();
}; };
void fill_tensor_random(Tensor a) { void fill_tensor_random(Tensor a)
const size_t num_elems = a.size(); {
std::vector<int8_t> host_values(num_elems); const size_t num_elems = a.size();
std::vector<int8_t> host_values(num_elems);
std::uniform_int_distribution<int8_t> int8_random(-128, 127); std::uniform_int_distribution<int8_t> int8_random(-128, 127);
std::mt19937 rng(0); std::mt19937 rng(0);
std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng](){ return int8_random(rng); }); std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); });
cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems); cudaH2Dcpy(a.getPtr<int8_t>(), host_values.data(), num_elems);
} }
...@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition() ...@@ -70,11 +71,11 @@ void Int8TestSuite::testTransposition()
int8_t *a_data, *a_t_data; int8_t *a_data, *a_t_data;
cudaMalloc(&a_data, m * k * sizeof(int8_t)); cudaMalloc(&a_data, m * k * sizeof(int8_t));
Tensor a {MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data}; Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data};
fill_tensor_random(a); fill_tensor_random(a);
cudaMalloc(&a_t_data, k * m * sizeof(int8_t)); cudaMalloc(&a_t_data, k * m * sizeof(int8_t));
Tensor a_t {MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data}; Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data};
std::vector<int8_t> a_t_host_ref(a_t.size()); std::vector<int8_t> a_t_host_ref(a_t.size());
reference_transpose_host(a_t_host_ref, a); reference_transpose_host(a_t_host_ref, a);
......
#include <assert.h> #include <assert.h>
#include <math.h>
#include <float.h> #include <float.h>
#include <math.h>
#include <stdexcept> #include <stdexcept>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#ifdef __linux__
#include <sys/time.h> #include <sys/time.h>
#endif
#include "src/turbomind/kernels/logprob_kernels.h" #include "src/turbomind/kernels/logprob_kernels.h"
#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
...@@ -24,22 +25,26 @@ struct LogProbKernelTestParam { ...@@ -24,22 +25,26 @@ struct LogProbKernelTestParam {
size_t vocab_size; size_t vocab_size;
size_t beam_width; size_t beam_width;
std::string toString() { std::string toString()
{
return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]", return fmtstr("LogProbKernelTestParam[max_input_length=%ld, batch=%ld, vocab=%ld, beam_width=%ld]",
max_input_length, batch_size, vocab_size, beam_width); max_input_length,
batch_size,
vocab_size,
beam_width);
} }
}; };
/////////////////////////////////// Unittests ////////////////////////////////////////// /////////////////////////////////// Unittests //////////////////////////////////////////
template<typename T> template<typename T>
class LogProbKernelTest : public FtTestBase { class LogProbKernelTest: public FtTestBase {
protected: protected:
void computeCumLogProbs(float* cum_log_probs, void computeCumLogProbs(float* cum_log_probs,
float* log_probs, float* log_probs,
const T* logits, const T* logits,
const int* input_ids, const int* input_ids,
const int* input_lengths, const int* input_lengths,
const size_t max_input_length, const size_t max_input_length,
const size_t batch_size, const size_t batch_size,
const size_t vocab_size, const size_t vocab_size,
...@@ -54,9 +59,9 @@ protected: ...@@ -54,9 +59,9 @@ protected:
cum_log_probs[i] = 0.0f; cum_log_probs[i] = 0.0f;
} }
else if ((int)step < input_lengths[i]) { else if ((int)step < input_lengths[i]) {
size_t step_offset = (step - 1) * batch_size * vocab_size_padded; size_t step_offset = (step - 1) * batch_size * vocab_size_padded;
const T* vec = logits + step_offset + i * vocab_size_padded; const T* vec = logits + step_offset + i * vocab_size_padded;
float max_logits = -FLT_MAX; float max_logits = -FLT_MAX;
for (size_t v = 0; v < vocab_size; ++v) { for (size_t v = 0; v < vocab_size; ++v) {
float val = static_cast<float>(vec[v]); float val = static_cast<float>(vec[v]);
if (val > max_logits) { if (val > max_logits) {
...@@ -67,7 +72,7 @@ protected: ...@@ -67,7 +72,7 @@ protected:
for (size_t v = 0; v < vocab_size; ++v) { for (size_t v = 0; v < vocab_size; ++v) {
sum += expf(static_cast<float>(vec[v]) - max_logits); sum += expf(static_cast<float>(vec[v]) - max_logits);
} }
int token_id = input_ids[step * batch_size + i]; int token_id = input_ids[step * batch_size + i];
float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum); float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
if (log_probs != nullptr) { if (log_probs != nullptr) {
log_probs[step * batch_size + i] = log_prob; log_probs[step * batch_size + i] = log_prob;
...@@ -78,11 +83,11 @@ protected: ...@@ -78,11 +83,11 @@ protected:
} }
} }
void computeCumLogProbsBatchFirst(float* cum_log_probs, void computeCumLogProbsBatchFirst(float* cum_log_probs,
float* log_probs, float* log_probs,
const T* logits, const T* logits,
const int* input_ids, const int* input_ids,
const int* input_lengths, const int* input_lengths,
const size_t max_input_length, const size_t max_input_length,
const size_t batch_size, const size_t batch_size,
const size_t vocab_size, const size_t vocab_size,
...@@ -98,8 +103,8 @@ protected: ...@@ -98,8 +103,8 @@ protected:
cum_log_probs[i] = 0.0f; cum_log_probs[i] = 0.0f;
} }
else if ((int)step < input_lengths[i]) { else if ((int)step < input_lengths[i]) {
const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded; const T* vec = logits + batch_offset + (step - 1) * vocab_size_padded;
float max_logits = -FLT_MAX; float max_logits = -FLT_MAX;
for (size_t v = 0; v < vocab_size; ++v) { for (size_t v = 0; v < vocab_size; ++v) {
float val = static_cast<float>(vec[v]); float val = static_cast<float>(vec[v]);
if (val > max_logits) { if (val > max_logits) {
...@@ -110,7 +115,7 @@ protected: ...@@ -110,7 +115,7 @@ protected:
for (size_t v = 0; v < vocab_size; ++v) { for (size_t v = 0; v < vocab_size; ++v) {
sum += expf(static_cast<float>(vec[v]) - max_logits); sum += expf(static_cast<float>(vec[v]) - max_logits);
} }
int token_id = input_ids[i * max_input_length + step]; int token_id = input_ids[i * max_input_length + step];
float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum); float log_prob = static_cast<float>(vec[token_id]) - max_logits - log(sum);
if (log_probs != nullptr) { if (log_probs != nullptr) {
log_probs[i * max_input_length + step] = log_prob; log_probs[i * max_input_length + step] = log_prob;
...@@ -122,17 +127,17 @@ protected: ...@@ -122,17 +127,17 @@ protected:
} }
public: public:
void runTest(LogProbKernelTestParam param)
void runTest(LogProbKernelTestParam param) { {
size_t max_input_length = param.max_input_length; size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width; size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
// Make multiple of 8 as GPT does. // Make multiple of 8 as GPT does.
size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8); size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
// input values // input values
T* h_logits = new T[max_input_length * batchxbeam * vocab_size]; T* h_logits = new T[max_input_length * batchxbeam * vocab_size];
int* h_input_ids = new int[max_input_length * batchxbeam]; int* h_input_ids = new int[max_input_length * batchxbeam];
int* h_input_lengths = new int[batchxbeam]; int* h_input_lengths = new int[batchxbeam];
// output buffers // output buffers
...@@ -145,9 +150,9 @@ public: ...@@ -145,9 +150,9 @@ public:
memset(expected_cum_log_probs, 0, sizeof(float) * batchxbeam); memset(expected_cum_log_probs, 0, sizeof(float) * batchxbeam);
// device buffers // device buffers
T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size)); T* d_logits = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam)); int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam)); float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers // initialize device buffers
...@@ -157,7 +162,7 @@ public: ...@@ -157,7 +162,7 @@ public:
deviceFill(d_cum_log_probs, batchxbeam, 0.0f); deviceFill(d_cum_log_probs, batchxbeam, 0.0f);
size_t workspace_size = sizeof(float) * max_input_length * batchxbeam; size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
void* workspace = allocator->malloc(workspace_size); void* workspace = allocator->malloc(workspace_size);
invokeLogProbFromLogits(d_cum_log_probs, invokeLogProbFromLogits(d_cum_log_probs,
d_logits, d_logits,
d_input_ids, d_input_ids,
...@@ -189,16 +194,17 @@ public: ...@@ -189,16 +194,17 @@ public:
delete[] h_logits; delete[] h_logits;
} }
void runBatchFirstTest(LogProbKernelTestParam param) { void runBatchFirstTest(LogProbKernelTestParam param)
{
size_t max_input_length = param.max_input_length; size_t max_input_length = param.max_input_length;
size_t batchxbeam = param.batch_size * param.beam_width; size_t batchxbeam = param.batch_size * param.beam_width;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
// Make multiple of 8 as GPT does. // Make multiple of 8 as GPT does.
size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8); size_t vocab_size_padded = static_cast<size_t>(ceil(vocab_size / 8.f) * 8);
// input values // input values
T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded]; T* h_logits = new T[max_input_length * batchxbeam * vocab_size_padded];
int* h_input_ids = new int[max_input_length * batchxbeam]; int* h_input_ids = new int[max_input_length * batchxbeam];
int* h_input_lengths = new int[batchxbeam]; int* h_input_lengths = new int[batchxbeam];
// output buffers // output buffers
...@@ -213,8 +219,8 @@ public: ...@@ -213,8 +219,8 @@ public:
// device buffers // device buffers
T* d_logits = T* d_logits =
reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded)); reinterpret_cast<T*>(allocator->malloc(sizeof(T) * max_input_length * batchxbeam * vocab_size_padded));
int *d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam)); int* d_input_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * max_input_length * batchxbeam));
int *d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam)); int* d_input_lengths = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batchxbeam));
float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam)); float* d_cum_log_probs = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batchxbeam));
// initialize device buffers // initialize device buffers
...@@ -224,7 +230,7 @@ public: ...@@ -224,7 +230,7 @@ public:
check_cuda_error(cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam)); check_cuda_error(cudaMemset(d_cum_log_probs, 0, sizeof(float) * batchxbeam));
size_t workspace_size = sizeof(float) * max_input_length * batchxbeam; size_t workspace_size = sizeof(float) * max_input_length * batchxbeam;
void* workspace = allocator->malloc(workspace_size); void* workspace = allocator->malloc(workspace_size);
invokeLogProbFromLogits(d_cum_log_probs, invokeLogProbFromLogits(d_cum_log_probs,
d_logits, d_logits,
d_input_ids, d_input_ids,
...@@ -239,16 +245,16 @@ public: ...@@ -239,16 +245,16 @@ public:
true); true);
computeCumLogProbsBatchFirst(expected_cum_log_probs, computeCumLogProbsBatchFirst(expected_cum_log_probs,
nullptr, nullptr,
h_logits, h_logits,
h_input_ids, h_input_ids,
h_input_lengths, h_input_lengths,
max_input_length, max_input_length,
batchxbeam, batchxbeam,
vocab_size, vocab_size,
vocab_size_padded); vocab_size_padded);
std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)"); std::string tag = param.toString() + (std::is_same<T, float>::value ? " (fp32)" : " (fp16)");
bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam); bool passed = checkResult(tag.c_str(), d_cum_log_probs, expected_cum_log_probs, batchxbeam);
EXPECT_TRUE(passed); EXPECT_TRUE(passed);
delete[] expected_cum_log_probs; delete[] expected_cum_log_probs;
...@@ -256,10 +262,8 @@ public: ...@@ -256,10 +262,8 @@ public:
delete[] h_input_ids; delete[] h_input_ids;
delete[] h_logits; delete[] h_logits;
} }
}; };
TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes); TYPED_TEST_SUITE(LogProbKernelTest, FloatAndHalfTypes);
TYPED_TEST(LogProbKernelTest, SingleStep) TYPED_TEST(LogProbKernelTest, SingleStep)
......
...@@ -14,24 +14,24 @@ ...@@ -14,24 +14,24 @@
* limitations under the License. * limitations under the License.
*/ */
#include <algorithm> // std::min, std::max #include <algorithm> // std::min, std::max
#include <iostream> // snprintf #include <iostream> // snprintf
#include <math.h> // expf, log #include <math.h> // expf, log
#include <stdexcept> #include <stdexcept>
#include <stdlib.h> // rand #include <stdlib.h> // rand
#include <string> // std::string #include <string> // std::string
#include <unordered_map> #include <unordered_map>
#include <vector> // std::vector #include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h> #include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "gtest_utils.h"
#include "src/turbomind/kernels/penalty_types.h" #include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include "gtest_utils.h"
using namespace turbomind; using namespace turbomind;
...@@ -41,21 +41,25 @@ struct TemperatureTestParam { ...@@ -41,21 +41,25 @@ struct TemperatureTestParam {
float* temperatures; float* temperatures;
size_t temperatures_size; size_t temperatures_size;
std::string toString() { std::string toString()
{
return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]", return fmtstr("TemperatureTestParam[batch=%ld, vocab=%ld, temperatures=%s]",
batch_size, vocab_size, arr2str(temperatures, temperatures_size).c_str()); batch_size,
vocab_size,
arr2str(temperatures, temperatures_size).c_str());
} }
}; };
/**
 * Round a vocabulary size up to the nearest multiple of `pad`.
 *
 * @param vocab_size  Unpadded vocabulary size.
 * @param pad         Alignment granularity (default 8). A value of 0 means
 *                    "no padding" and returns `vocab_size` unchanged instead
 *                    of dividing by zero.
 * @return            The smallest multiple of `pad` that is >= `vocab_size`.
 */
size_t pad_vocab_size(size_t vocab_size, size_t pad = 8)
{
    if (pad == 0) {
        return vocab_size;
    }
    // Work from the remainder rather than (vocab_size + pad - 1) so the
    // intermediate sum cannot wrap for values close to SIZE_MAX.
    const size_t remainder = vocab_size % pad;
    return remainder == 0 ? vocab_size : vocab_size + (pad - remainder);
}
template<typename T> template<typename T>
void applyRepetitonPenalty(T* logits, void applyRepetitonPenalty(T* logits,
const int* output_ids, const int* output_ids,
const int* input_lengths, const int* input_lengths,
const float repetition_penalty, const float repetition_penalty,
const size_t step, const size_t step,
const size_t max_input_length, const size_t max_input_length,
const size_t batch_size, const size_t batch_size,
...@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits, ...@@ -74,8 +78,8 @@ void applyRepetitonPenalty(T* logits,
int token_id = output_ids[i + t * batch_size]; int token_id = output_ids[i + t * batch_size];
if (!penalized[token_id]) { if (!penalized[token_id]) {
float logit = static_cast<float>(logits[offset + token_id]); float logit = static_cast<float>(logits[offset + token_id]);
logits[offset + token_id] = static_cast<T>(logit < 0.0f ? logits[offset + token_id] =
logit * repetition_penalty : logit / repetition_penalty); static_cast<T>(logit < 0.0f ? logit * repetition_penalty : logit / repetition_penalty);
penalized[token_id] = true; penalized[token_id] = true;
} }
} }
...@@ -84,9 +88,9 @@ void applyRepetitonPenalty(T* logits, ...@@ -84,9 +88,9 @@ void applyRepetitonPenalty(T* logits,
} }
template<typename T> template<typename T>
void batchApplyRepetitonPenalty(T* logits, void batchApplyRepetitonPenalty(T* logits,
const int* output_ids, const int* output_ids,
const int* input_lengths, const int* input_lengths,
const float* repetition_penalties, const float* repetition_penalties,
const size_t step, const size_t step,
const size_t max_input_length, const size_t max_input_length,
...@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits, ...@@ -116,11 +120,8 @@ void batchApplyRepetitonPenalty(T* logits,
} }
template<typename T> template<typename T>
void initLogitsAndBias(T* logits, void initLogitsAndBias(
T* bias, T* logits, T* bias, const size_t batch_size, const size_t vocab_size, const size_t vocab_size_padded)
const size_t batch_size,
const size_t vocab_size,
const size_t vocab_size_padded)
{ {
initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f); initRandom(logits, batch_size * vocab_size_padded, -5.0f, 5.0f);
if (bias != nullptr) { if (bias != nullptr) {
...@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits, ...@@ -139,11 +140,10 @@ void initLogitsAndBias(T* logits,
} }
} }
/////////////////////////////////// Tests ////////////////////////////////////////// /////////////////////////////////// Tests //////////////////////////////////////////
template<typename T> template<typename T>
class TemperaturePenaltyTest : public FtTestBase { class TemperaturePenaltyTest: public FtTestBase {
protected: protected:
// Set up test // Set up test
size_t batch_size_; size_t batch_size_;
...@@ -157,17 +157,18 @@ protected: ...@@ -157,17 +157,18 @@ protected:
float* d_temperatures_; float* d_temperatures_;
void subsetup(TemperatureTestParam param) { void subsetup(TemperatureTestParam param)
batch_size_ = param.batch_size; {
vocab_size_ = param.vocab_size; batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_); vocab_size_padded_ = pad_vocab_size(vocab_size_);
h_logits_ = new T[batch_size_ * vocab_size_padded_]; h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_]; h_bias_ = new T[vocab_size_padded_];
initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_); initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_)); d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_)); d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream); cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream); cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
if (param.temperatures_size > 1) { if (param.temperatures_size > 1) {
...@@ -177,7 +178,8 @@ protected: ...@@ -177,7 +178,8 @@ protected:
} }
} }
void subteardown() { void subteardown()
{
delete[] h_logits_; delete[] h_logits_;
delete[] h_bias_; delete[] h_bias_;
} }
...@@ -195,7 +197,7 @@ protected: ...@@ -195,7 +197,7 @@ protected:
ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature; ASSERT_GT(temperature, 0.0f) << "temperature should be positive but got " << temperature;
for (size_t j = 0; j < vocab_size; ++j) { for (size_t j = 0; j < vocab_size; ++j) {
size_t index = i * vocab_size_padded + j; size_t index = i * vocab_size_padded + j;
float logit = static_cast<float>(logits[index]); float logit = static_cast<float>(logits[index]);
if (bias != nullptr) { if (bias != nullptr) {
logit += static_cast<float>(bias[j]); logit += static_cast<float>(bias[j]);
} }
...@@ -204,29 +206,18 @@ protected: ...@@ -204,29 +206,18 @@ protected:
} }
} }
public: public:
void runTest(TemperatureTestParam param) void runTest(TemperatureTestParam param)
{ {
subsetup(param); subsetup(param);
// Do test // Do test
if (param.temperatures_size == 1) { if (param.temperatures_size == 1) {
invokeApplyTemperaturePenalty(d_logits_, invokeApplyTemperaturePenalty(
d_bias_, d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
param.temperatures[0],
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
} }
else { else {
invokeBatchApplyTemperaturePenalty(d_logits_, invokeBatchApplyTemperaturePenalty(
d_bias_, d_logits_, d_bias_, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
d_temperatures_,
batch_size_,
vocab_size_,
vocab_size_padded_,
stream);
} }
computeReference(h_logits_, computeReference(h_logits_,
h_bias_, h_bias_,
...@@ -240,21 +231,17 @@ public: ...@@ -240,21 +231,17 @@ public:
subteardown(); subteardown();
} }
void runConsistencyTest(TemperatureTestParam param) { void runConsistencyTest(TemperatureTestParam param)
{
// Set up test // Set up test
ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1"; ASSERT_EQ(param.temperatures_size, 1) << "A consistency test assumes temperatures_size=1";
subsetup(param); subsetup(param);
// Run a single runtime value case. // Run a single runtime value case.
invokeApplyTemperaturePenalty(d_logits_, invokeApplyTemperaturePenalty(
d_bias_, d_logits_, d_bias_, param.temperatures[0], batch_size_, vocab_size_, vocab_size_padded_, stream);
param.temperatures[0],
batch_size_, float temperature = param.temperatures[0];
vocab_size_,
vocab_size_padded_,
stream);
float temperature = param.temperatures[0];
float* h_temperatures = new float[batch_size_]; float* h_temperatures = new float[batch_size_];
for (size_t i = 0; i < batch_size_; ++i) { for (size_t i = 0; i < batch_size_; ++i) {
h_temperatures[i] = temperature; h_temperatures[i] = temperature;
...@@ -263,18 +250,14 @@ public: ...@@ -263,18 +250,14 @@ public:
cudaAutoCpy(d_temperatures_, h_temperatures, batch_size_, stream); cudaAutoCpy(d_temperatures_, h_temperatures, batch_size_, stream);
T* d_logits_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_)); T* d_logits_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_)); T* d_bias_batch = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream); cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream); cudaAutoCpy(d_bias_batch, h_bias_, vocab_size_padded_, stream);
invokeBatchApplyTemperaturePenalty(d_logits_batch, invokeBatchApplyTemperaturePenalty(
d_bias_batch, d_logits_batch, d_bias_batch, d_temperatures_, batch_size_, vocab_size_, vocab_size_padded_, stream);
d_temperatures_, bool passed =
batch_size_, checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
vocab_size_,
vocab_size_padded_,
stream);
bool passed = checkResult(param.toString(), d_logits_, d_logits_batch, batch_size_ * vocab_size_padded_, true, true);
EXPECT_TRUE(passed); EXPECT_TRUE(passed);
// Tear down test // Tear down test
...@@ -315,7 +298,7 @@ TYPED_TEST(TemperaturePenaltyTest, LargeVocab) ...@@ -315,7 +298,7 @@ TYPED_TEST(TemperaturePenaltyTest, LargeVocab)
TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty) TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
{ {
size_t batch_size = 6; size_t batch_size = 6;
float* temperatures = new float[batch_size]; float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 1.0f; temperatures[i] = 1.0f;
...@@ -325,7 +308,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty) ...@@ -325,7 +308,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchNoPenalty)
TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne) TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
{ {
size_t batch_size = 6; size_t batch_size = 6;
float* temperatures = new float[batch_size]; float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 0.53f; temperatures[i] = 0.53f;
...@@ -335,7 +318,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne) ...@@ -335,7 +318,7 @@ TYPED_TEST(TemperaturePenaltyTest, BatchLessThanOne)
TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne) TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
{ {
size_t batch_size = 6; size_t batch_size = 6;
float* temperatures = new float[batch_size]; float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 2.01f; temperatures[i] = 2.01f;
...@@ -345,10 +328,10 @@ TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne) ...@@ -345,10 +328,10 @@ TYPED_TEST(TemperaturePenaltyTest, BatchGreaterThaneOne)
// Batch case with alternating per-sample temperatures: even slots use 2.01
// and odd slots use 0.53.
TYPED_TEST(TemperaturePenaltyTest, BatchMixed)
{
    const size_t batch_size = 6;
    // A std::vector owns the buffer so it is released when the test body ends;
    // the previous `new float[]` allocation was never freed in this test.
    // NOTE(review): assumes runTest only reads the pointer and does not take
    // ownership of it - confirm against the full runTest body.
    std::vector<float> temperatures(batch_size);
    for (size_t i = 0; i < batch_size; ++i) {
        temperatures[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, temperatures.data(), batch_size});
}
...@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase { ...@@ -367,22 +350,24 @@ struct RepetitionPenaltyTestCase {
size_t repetition_penalties_size; size_t repetition_penalties_size;
RepetitionPenaltyType repetition_penalty_type; RepetitionPenaltyType repetition_penalty_type;
std::string toString() { std::string toString()
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map { {
static const std::unordered_map<RepetitionPenaltyType, std::string> typestr_map{
{RepetitionPenaltyType::Additive, "additive"}, {RepetitionPenaltyType::Additive, "additive"},
{RepetitionPenaltyType::Multiplicative, "multiplicative"}, {RepetitionPenaltyType::Multiplicative, "multiplicative"},
{RepetitionPenaltyType::None, "none"}}; {RepetitionPenaltyType::None, "none"}};
return fmtstr( return fmtstr("RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, "
"RepetitionPenaltyTestCase[batch=%ld, vocab=%ld, max_input_length=%ld, " "repetition_penalties=%s, repetition_penalty_type=%s]",
"repetition_penalties=%s, repetition_penalty_type=%s]", batch_size,
batch_size, vocab_size, max_input_length, vocab_size,
arr2str(repetition_penalties, repetition_penalties_size).c_str(), max_input_length,
typestr_map.at(repetition_penalty_type).c_str()); arr2str(repetition_penalties, repetition_penalties_size).c_str(),
typestr_map.at(repetition_penalty_type).c_str());
} }
}; };
template<typename T> template<typename T>
class RepetitionPenaltyTest : public FtTestBase { class RepetitionPenaltyTest: public FtTestBase {
protected: protected:
// Set up test // Set up test
size_t batch_size_; size_t batch_size_;
...@@ -392,37 +377,38 @@ protected: ...@@ -392,37 +377,38 @@ protected:
size_t sequence_length_; size_t sequence_length_;
size_t step_; size_t step_;
T* h_logits_; T* h_logits_;
T* h_bias_; T* h_bias_;
int* h_output_ids_; int* h_output_ids_;
int* h_input_lengths_; int* h_input_lengths_;
T* d_logits_; T* d_logits_;
T* d_bias_; T* d_bias_;
int* d_output_ids_; int* d_output_ids_;
int* d_input_lengths_; int* d_input_lengths_;
float* d_repetition_penalties_; float* d_repetition_penalties_;
void subsetup(RepetitionPenaltyTestCase param) { void subsetup(RepetitionPenaltyTestCase param)
batch_size_ = param.batch_size; {
vocab_size_ = param.vocab_size; batch_size_ = param.batch_size;
vocab_size_ = param.vocab_size;
vocab_size_padded_ = pad_vocab_size(vocab_size_); vocab_size_padded_ = pad_vocab_size(vocab_size_);
max_input_length_ = param.max_input_length; max_input_length_ = param.max_input_length;
sequence_length_ = 2 * max_input_length_; // input + output sequence_length_ = 2 * max_input_length_; // input + output
step_ = sequence_length_ * 0.7; step_ = sequence_length_ * 0.7;
h_logits_ = new T[batch_size_ * vocab_size_padded_]; h_logits_ = new T[batch_size_ * vocab_size_padded_];
h_bias_ = new T[vocab_size_padded_]; h_bias_ = new T[vocab_size_padded_];
h_output_ids_ = new int[sequence_length_ * batch_size_]; h_output_ids_ = new int[sequence_length_ * batch_size_];
h_input_lengths_ = new int[batch_size_]; h_input_lengths_ = new int[batch_size_];
initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_); initLogitsAndBias(h_logits_, h_bias_, batch_size_, vocab_size_, vocab_size_padded_);
initRandomInt(h_output_ids_, sequence_length_ * batch_size_, 0, vocab_size_); initRandomInt(h_output_ids_, sequence_length_ * batch_size_, 0, vocab_size_);
initRandomInt(h_input_lengths_, batch_size_, 1, max_input_length_); initRandomInt(h_input_lengths_, batch_size_, 1, max_input_length_);
d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_)); d_logits_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * batch_size_ * vocab_size_padded_));
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_)); d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_)); d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_)); d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream); cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
...@@ -437,7 +423,8 @@ protected: ...@@ -437,7 +423,8 @@ protected:
} }
} }
void subteardown() { void subteardown()
{
delete[] h_logits_; delete[] h_logits_;
delete[] h_bias_; delete[] h_bias_;
delete[] h_output_ids_; delete[] h_output_ids_;
...@@ -540,7 +527,8 @@ public: ...@@ -540,7 +527,8 @@ public:
subteardown(); subteardown();
} }
void runConsistencyTest(RepetitionPenaltyTestCase param) { void runConsistencyTest(RepetitionPenaltyTestCase param)
{
// Set up test // Set up test
ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1"; ASSERT_EQ(param.repetition_penalties_size, 1) << "A consistency test assumes repetition_penalties_size=1";
subsetup(param); subsetup(param);
...@@ -618,7 +606,7 @@ TYPED_TEST(RepetitionPenaltyTest, LargeVocab) ...@@ -618,7 +606,7 @@ TYPED_TEST(RepetitionPenaltyTest, LargeVocab)
TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty) TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
{ {
size_t batch_size = 6; size_t batch_size = 6;
float* repetition_penalties = new float[batch_size]; float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = 1.0f; repetition_penalties[i] = 1.0f;
...@@ -628,7 +616,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty) ...@@ -628,7 +616,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchNoPenalty)
TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne) TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
{ {
size_t batch_size = 6; size_t batch_size = 6;
float* repetition_penalties = new float[batch_size]; float* repetition_penalties = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
repetition_penalties[i] = 0.53f; repetition_penalties[i] = 0.53f;
...@@ -638,7 +626,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne) ...@@ -638,7 +626,7 @@ TYPED_TEST(RepetitionPenaltyTest, BatchLessThanOne)
TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne) TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
{ {
size_t batch_size = 6; size_t batch_size = 6;
float* temperatures = new float[batch_size]; float* temperatures = new float[batch_size];
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
temperatures[i] = 2.01f; temperatures[i] = 2.01f;
...@@ -648,10 +636,10 @@ TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne) ...@@ -648,10 +636,10 @@ TYPED_TEST(RepetitionPenaltyTest, BatchGreaterThaneOne)
// Batch case with alternating per-sample multiplicative penalties: even slots
// use 2.01 and odd slots use 0.53.
TYPED_TEST(RepetitionPenaltyTest, BatchMixed)
{
    const size_t batch_size = 6;
    // A std::vector owns the buffer so it is released when the test body ends;
    // the previous `new float[]` allocation was never freed in this test.
    // NOTE(review): assumes runTest only reads the pointer and does not take
    // ownership of it - confirm against the full runTest body.
    std::vector<float> repetition_penalties(batch_size);
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest(
        {batch_size, 4, 5, repetition_penalties.data(), batch_size, RepetitionPenaltyType::Multiplicative});
}
...@@ -664,10 +652,10 @@ TYPED_TEST(RepetitionPenaltyTest, Consistency) ...@@ -664,10 +652,10 @@ TYPED_TEST(RepetitionPenaltyTest, Consistency)
// Batch case using the additive penalty type with alternating per-sample
// penalties: even slots use 2.01 and odd slots use 0.53.
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditive)
{
    const size_t batch_size = 6;
    // A std::vector owns the buffer so it is released when the test body ends;
    // the previous `new float[]` allocation was never freed in this test.
    // NOTE(review): assumes runTest only reads the pointer and does not take
    // ownership of it - confirm against the full runTest body.
    std::vector<float> repetition_penalties(batch_size);
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 2.01f : 0.53f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties.data(), batch_size, RepetitionPenaltyType::Additive});
}
...@@ -680,10 +668,10 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero) ...@@ -680,10 +668,10 @@ TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero)
// Batch case using the additive penalty type where the per-sample penalty
// alternates between 1.0 (even slots) and 0.0 (odd slots).
TYPED_TEST(RepetitionPenaltyTest, PenaltyTypeAdditiveHasDefaultValueZero2)
{
    const size_t batch_size = 6;
    // A std::vector owns the buffer so it is released when the test body ends;
    // the previous `new float[]` allocation was never freed in this test.
    // NOTE(review): assumes runTest only reads the pointer and does not take
    // ownership of it - confirm against the full runTest body.
    std::vector<float> repetition_penalties(batch_size);
    for (size_t i = 0; i < batch_size; ++i) {
        repetition_penalties[i] = i % 2 == 0 ? 1.0f : 0.0f;
    }
    this->runTest({batch_size, 4, 5, repetition_penalties.data(), batch_size, RepetitionPenaltyType::Additive});
}
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
......
#include <algorithm> // std::fill_n #include <algorithm> // std::fill_n
#include <iostream> // snprintf #include <iostream> // snprintf
#include <math.h> // expf, log #include <math.h> // expf, log
#include <stdlib.h> // rand #include <stdlib.h> // rand
#include <string> // std::string #include <string> // std::string
#include <vector> // std::vector #include <vector> // std::vector
#include <cublas_v2.h>
#include <cublasLt.h> #include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
...@@ -68,9 +69,9 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size) ...@@ -68,9 +69,9 @@ void computeProb(T* probs, T* logits, int batch_size, int vocab_size)
sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval); sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
} }
for (int i = 0; i < vocab_size; ++i) { for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i; int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval; float logit = static_cast<float>(logits[idx]) - maxval;
probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON)); probs[idx] = static_cast<T>(expf(logit) / (sum + EPSILON));
} }
} }
} }
...@@ -96,8 +97,8 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size) ...@@ -96,8 +97,8 @@ void computeLogProb(T* logprobs, T* logits, int batch_size, int vocab_size)
sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval); sum += expf(static_cast<float>(logits[bidx * vocab_size + i]) - maxval);
} }
for (int i = 0; i < vocab_size; ++i) { for (int i = 0; i < vocab_size; ++i) {
int idx = bidx * vocab_size + i; int idx = bidx * vocab_size + i;
float logit = static_cast<float>(logits[idx]) - maxval; float logit = static_cast<float>(logits[idx]) - maxval;
logprobs[idx] = static_cast<T>(logit - logf(sum + EPSILON)); logprobs[idx] = static_cast<T>(logit - logf(sum + EPSILON));
} }
} }
...@@ -119,10 +120,10 @@ public: ...@@ -119,10 +120,10 @@ public:
} }
protected: protected:
unsigned long long seed = 0; unsigned long long seed = 0;
cudaStream_t stream; cudaStream_t stream;
Allocator<AllocatorType::CUDA>* allocator; Allocator<AllocatorType::CUDA>* allocator;
curandState_t* curand_states; curandState_t* curand_states;
}; };
template<typename T> template<typename T>
...@@ -393,8 +394,8 @@ public: ...@@ -393,8 +394,8 @@ public:
{ {
this->runBatchTest(param, false, false); this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true); this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false); this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true); this->runBatchTest(param, true, true);
} }
}; };
...@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral) ...@@ -410,7 +411,6 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessAncestral)
this->runTest({6, 4, 1, 4, 1.0f, 1}); this->runTest({6, 4, 1, 4, 1.0f, 1});
}; };
TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63) TYPED_TEST(TopKSamplingKernelTest, CorrectnessLargeK63)
{ {
this->runTest({16, 51200, 1, 63, 1.0f, 8}); this->runTest({16, 51200, 1, 63, 1.0f, 8});
...@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP) ...@@ -456,7 +456,6 @@ TYPED_TEST(TopKSamplingKernelTest, BatchCorrectnessTopKTopP)
this->runBatchTest({8, 4000, 1, 63, 0.3f, 8}); this->runBatchTest({8, 4000, 1, 63, 0.3f, 8});
}; };
template<typename T> template<typename T>
class TopPSamplingKernelTest: public SamplingKernelTest<T> { class TopPSamplingKernelTest: public SamplingKernelTest<T> {
...@@ -473,7 +472,7 @@ public: ...@@ -473,7 +472,7 @@ public:
size_t batch_size = param.batch_size; size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
size_t output_len = param.output_len; size_t output_len = param.output_len;
size_t seq_len = output_len; size_t seq_len = output_len;
float top_p = param.top_p; float top_p = param.top_p;
...@@ -496,8 +495,8 @@ public: ...@@ -496,8 +495,8 @@ public:
struct cudaDeviceProp device_prop; struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device); cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>( curandState_t* curand_states =
allocator->malloc(sizeof(curandState_t) * batch_size, false)); reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream); invokeCurandInitialize(curand_states, batch_size, seed, stream);
int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size)); int* end_ids = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size));
...@@ -515,17 +514,17 @@ public: ...@@ -515,17 +514,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1))); int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size)); int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0; size_t workspace_size = 0;
size_t cub_temp_storage_size = 0; size_t cub_temp_storage_size = 0;
// retrieve the workspace size of the top-p sampling kernel. // retrieve the workspace size of the top-p sampling kernel.
invokeTopPSampling<T>(nullptr, // workspace invokeTopPSampling<T>(nullptr, // workspace
workspace_size, workspace_size,
cub_temp_storage_size, cub_temp_storage_size,
nullptr, // output_ids nullptr, // output_ids
nullptr, // sequence_length nullptr, // sequence_length
nullptr, // finished_buffer nullptr, // finished_buffer
nullptr, // cum_log_probs nullptr, // cum_log_probs
nullptr, // output_log_probs nullptr, // output_log_probs
(T*)nullptr, // log_probs (T*)nullptr, // log_probs
topp_id_vals_buf, topp_id_vals_buf,
end_offsets, end_offsets,
...@@ -553,12 +552,7 @@ public: ...@@ -553,12 +552,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size); computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size); cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf, invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeTopPSampling<T>(workspace, invokeTopPSampling<T>(workspace,
workspace_size, workspace_size,
...@@ -612,7 +606,7 @@ public: ...@@ -612,7 +606,7 @@ public:
size_t batch_size = param.batch_size; size_t batch_size = param.batch_size;
size_t vocab_size = param.vocab_size; size_t vocab_size = param.vocab_size;
float top_p = param.top_p; float top_p = param.top_p;
float* h_top_ps = new float[batch_size]; float* h_top_ps = new float[batch_size];
// Initialize runtime top k values. // Initialize runtime top k values.
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
...@@ -621,7 +615,7 @@ public: ...@@ -621,7 +615,7 @@ public:
float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size); float max_top_p = *std::max_element(h_top_ps, h_top_ps + batch_size);
size_t output_len = param.output_len; size_t output_len = param.output_len;
size_t seq_len = output_len; size_t seq_len = output_len;
// Logit values in the host of shape (batch_size x vocab_size). // Logit values in the host of shape (batch_size x vocab_size).
T* h_logits = new T[batch_size * vocab_size]; T* h_logits = new T[batch_size * vocab_size];
...@@ -647,8 +641,8 @@ public: ...@@ -647,8 +641,8 @@ public:
struct cudaDeviceProp device_prop; struct cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, device); cudaGetDeviceProperties(&device_prop, device);
curandState_t* curand_states = reinterpret_cast<curandState_t*>( curandState_t* curand_states =
allocator->malloc(sizeof(curandState_t) * batch_size, false)); reinterpret_cast<curandState_t*>(allocator->malloc(sizeof(curandState_t) * batch_size, false));
invokeCurandInitialize(curand_states, batch_size, seed, stream); invokeCurandInitialize(curand_states, batch_size, seed, stream);
float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size)); float* top_ps = reinterpret_cast<float*>(allocator->malloc(sizeof(float) * batch_size));
...@@ -668,17 +662,17 @@ public: ...@@ -668,17 +662,17 @@ public:
int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1))); int* end_offsets = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * (batch_size + 1)));
int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size)); int* topp_id_vals_buf = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size * vocab_size));
size_t workspace_size = 0; size_t workspace_size = 0;
size_t cub_temp_storage_size = 0; size_t cub_temp_storage_size = 0;
// retrieve the workspace size of the top-p sampling kernel. // retrieve the workspace size of the top-p sampling kernel.
invokeBatchTopPSampling<T>(nullptr, // workspace invokeBatchTopPSampling<T>(nullptr, // workspace
workspace_size, workspace_size,
cub_temp_storage_size, cub_temp_storage_size,
nullptr, // output_ids nullptr, // output_ids
nullptr, // sequence_length nullptr, // sequence_length
nullptr, // finished_buffer nullptr, // finished_buffer
nullptr, // cum_log_probs nullptr, // cum_log_probs
nullptr, // output_log_probs nullptr, // output_log_probs
(T*)nullptr, // log_probs (T*)nullptr, // log_probs
topp_id_vals_buf, topp_id_vals_buf,
end_offsets, end_offsets,
...@@ -709,12 +703,7 @@ public: ...@@ -709,12 +703,7 @@ public:
computeProb(h_probs, h_logits, batch_size, vocab_size); computeProb(h_probs, h_logits, batch_size, vocab_size);
cudaH2Dcpy(probs, h_probs, batch_size * vocab_size); cudaH2Dcpy(probs, h_probs, batch_size * vocab_size);
invokeTopPInitialize(topp_id_vals_buf, invokeTopPInitialize(topp_id_vals_buf, end_offsets, begin_offsets, batch_size, vocab_size, stream);
end_offsets,
begin_offsets,
batch_size,
vocab_size,
stream);
invokeBatchTopPSampling<T>(workspace, invokeBatchTopPSampling<T>(workspace,
workspace_size, workspace_size,
...@@ -773,8 +762,8 @@ public: ...@@ -773,8 +762,8 @@ public:
{ {
this->runBatchTest(param, false, false); this->runBatchTest(param, false, false);
this->runBatchTest(param, false, true); this->runBatchTest(param, false, true);
this->runBatchTest(param, true, false); this->runBatchTest(param, true, false);
this->runBatchTest(param, true, true); this->runBatchTest(param, true, true);
} }
}; };
...@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2) ...@@ -825,30 +814,31 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
this->runBatchTest({8, 4000, 1, 0, 0.9f, 16}); this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
}; };
/**
 * Test helper kernel: draws one value from each curand state.
 *
 * Thread i calls curand() on states[i] once and stores the result in vals[i].
 * Threads whose index is not below batch_size do nothing.
 *
 * The index is the global thread index, so the kernel stays correct when the
 * batch is spread over several blocks. For a single-block launch blockIdx.x is 0
 * and the index reduces to threadIdx.x.
 *
 * @param vals       output buffer, indexed up to batch_size - 1
 * @param states     curand states, indexed up to batch_size - 1
 * @param batch_size number of states to sample
 */
__global__ void generateRandomNumber(unsigned int* vals, curandState_t* states, const int batch_size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= batch_size) {
        return;
    }
    vals[idx] = curand(states + idx);
}
TEST(SamplingKernelTest, CurandBatchInitialize) { TEST(SamplingKernelTest, CurandBatchInitialize)
size_t batch_size = 127; {
size_t batch_size = 127;
cudaStream_t stream; cudaStream_t stream;
cudaStreamCreate(&stream); cudaStreamCreate(&stream);
curandState_t* curand_states; curandState_t* curand_states;
check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size)); check_cuda_error(cudaMalloc(&curand_states, sizeof(curandState_t) * batch_size));
unsigned long long* h_random_seeds = new unsigned long long[batch_size]; unsigned long long* h_random_seeds = new unsigned long long[batch_size];
const size_t period_size = 3; const size_t period_size = 3;
for (size_t i = 0; i < batch_size; ++i) { for (size_t i = 0; i < batch_size; ++i) {
h_random_seeds[i] = i / period_size; h_random_seeds[i] = i / period_size;
} }
unsigned long long* d_random_seeds; unsigned long long* d_random_seeds;
check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size)); check_cuda_error(cudaMalloc(&d_random_seeds, sizeof(unsigned long long) * batch_size));
check_cuda_error(cudaMemcpy(d_random_seeds, h_random_seeds, check_cuda_error(
sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice)); cudaMemcpy(d_random_seeds, h_random_seeds, sizeof(unsigned long long) * batch_size, cudaMemcpyHostToDevice));
// Initialize curand states. // Initialize curand states.
invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream); invokeCurandBatchInitialize(curand_states, batch_size, d_random_seeds, stream);
...@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) { ...@@ -859,8 +849,8 @@ TEST(SamplingKernelTest, CurandBatchInitialize) {
unsigned int* h_rand_vals = new unsigned int[batch_size]; unsigned int* h_rand_vals = new unsigned int[batch_size];
check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size)); check_cuda_error(cudaMalloc(&d_rand_vals, sizeof(unsigned int) * batch_size));
generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size); generateRandomNumber<<<1, batch_size, 0, stream>>>(d_rand_vals, curand_states, batch_size);
check_cuda_error(cudaMemcpyAsync( check_cuda_error(
h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream)); cudaMemcpyAsync(h_rand_vals, d_rand_vals, sizeof(unsigned int) * batch_size, cudaMemcpyDeviceToHost, stream));
check_cuda_error(cudaStreamSynchronize(stream)); check_cuda_error(cudaStreamSynchronize(stream));
// The same seed produces the same random number. // The same seed produces the same random number.
......
This diff is collapsed.
#include <iostream> #include <iostream>
#include <vector>
#include <unordered_map> #include <unordered_map>
#include <vector>
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -10,16 +10,17 @@ using namespace turbomind; ...@@ -10,16 +10,17 @@ using namespace turbomind;
namespace { namespace {
// Expects that two tensors have equal `where`, `type`, `shape` and `data` members
// (each pair is compared with ==).
//
// Both arguments are parenthesized and bound to a reference exactly once, so an
// argument that constructs a temporary or has side effects is evaluated a single
// time rather than once per compared member.
#define EXPECT_EQUAL_TENSORS(t1, t2) \
    do { \
        const auto& expect_equal_lhs_ = (t1); \
        const auto& expect_equal_rhs_ = (t2); \
        EXPECT_TRUE(expect_equal_lhs_.where == expect_equal_rhs_.where); \
        EXPECT_TRUE(expect_equal_lhs_.type == expect_equal_rhs_.type); \
        EXPECT_TRUE(expect_equal_lhs_.shape == expect_equal_rhs_.shape); \
        EXPECT_TRUE(expect_equal_lhs_.data == expect_equal_rhs_.data); \
    } while (false)
TEST(TensorMapTest, HasKeyCorrectness) { TEST(TensorMapTest, HasKeyCorrectness)
bool* v1 = new bool(true); {
bool* v1 = new bool(true);
float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1}; Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, v1};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2}; Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2};
...@@ -33,8 +34,9 @@ TEST(TensorMapTest, HasKeyCorrectness) { ...@@ -33,8 +34,9 @@ TEST(TensorMapTest, HasKeyCorrectness) {
delete[] v2; delete[] v2;
} }
TEST(TensorMapTest, InsertCorrectness) { TEST(TensorMapTest, InsertCorrectness)
int* v1 = new int[4]{1, 10, 20, 30}; {
int* v1 = new int[4]{1, 10, 20, 30};
float* v2 = new float[2]{1.0f, 2.0f}; float* v2 = new float[2]{1.0f, 2.0f};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2); Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2);
...@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) { ...@@ -46,7 +48,8 @@ TEST(TensorMapTest, InsertCorrectness) {
EXPECT_FALSE(map.isExist("t2")); EXPECT_FALSE(map.isExist("t2"));
} }
TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) { TEST(TensorMapTest, InsertDoesNotAllowNoneTensor)
{
TensorMap map; TensorMap map;
EXPECT_TRUE(map.size() == 0); EXPECT_TRUE(map.size() == 0);
// forbid a none tensor. // forbid a none tensor.
...@@ -57,10 +60,11 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) { ...@@ -57,10 +60,11 @@ TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) {
EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error); EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error);
} }
TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) { TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey)
int* v1 = new int[4]{1, 10, 20, 30}; {
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); int* v1 = new int[4]{1, 10, 20, 30};
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1); Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1);
TensorMap map({{"t1", t1}}); TensorMap map({{"t1", t1}});
EXPECT_TRUE(map.size() == 1); EXPECT_TRUE(map.size() == 1);
// forbid a duplicated key. // forbid a duplicated key.
...@@ -68,8 +72,9 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) { ...@@ -68,8 +72,9 @@ TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) {
delete[] v1; delete[] v1;
} }
TEST(TensorMapTest, GetValCorrectness) { TEST(TensorMapTest, GetValCorrectness)
int* v1 = new int[4]{1, 10, 20, 30}; {
int* v1 = new int[4]{1, 10, 20, 30};
Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1);
TensorMap map({{"t1", t1}}); TensorMap map({{"t1", t1}});
...@@ -93,13 +98,14 @@ TEST(TensorMapTest, GetValCorrectness) { ...@@ -93,13 +98,14 @@ TEST(TensorMapTest, GetValCorrectness) {
delete[] v1; delete[] v1;
} }
TEST(TensorMapTest, GetTensorCorrectness) { TEST(TensorMapTest, GetTensorCorrectness)
bool* t1_val = new bool(true); {
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val}; Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val}; Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
int* default_val = new int[4]{0, 1, 2, 3}; int* default_val = new int[4]{0, 1, 2, 3};
Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val}; Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
TensorMap map({{"t1", t1}, {"t2", t2}}); TensorMap map({{"t1", t1}, {"t2", t2}});
...@@ -114,13 +120,14 @@ TEST(TensorMapTest, GetTensorCorrectness) { ...@@ -114,13 +120,14 @@ TEST(TensorMapTest, GetTensorCorrectness) {
delete[] t1_val; delete[] t1_val;
} }
TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) { TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap)
bool* t1_val = new bool(true); {
bool* t1_val = new bool(true);
float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val}; Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val};
Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val}; Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val};
int* default_val = new int[4]{0, 1, 2, 3}; int* default_val = new int[4]{0, 1, 2, 3};
Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val}; Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val};
const TensorMap map({{"t1", t1}, {"t2", t2}}); const TensorMap map({{"t1", t1}, {"t2", t2}});
...@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) { ...@@ -135,7 +142,8 @@ TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) {
delete[] t1_val; delete[] t1_val;
} }
TEST(TensorTest, EmptyTensorMinMaxRaiseError) { TEST(TensorTest, EmptyTensorMinMaxRaiseError)
{
Tensor t1; Tensor t1;
EXPECT_THROW(t1.min<int>(), std::runtime_error); EXPECT_THROW(t1.min<int>(), std::runtime_error);
EXPECT_THROW(t1.max<int>(), std::runtime_error); EXPECT_THROW(t1.max<int>(), std::runtime_error);
...@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) { ...@@ -145,22 +153,22 @@ TEST(TensorTest, EmptyTensorMinMaxRaiseError) {
EXPECT_THROW(t2.max<int>(), std::runtime_error); EXPECT_THROW(t2.max<int>(), std::runtime_error);
} }
using TensorTypes = testing::Types<int8_t, int, float>; using TensorTypes = testing::Types<int8_t, int, float>;
template <typename T> template<typename T>
class TensorFuncTest : public testing::Test {}; class TensorFuncTest: public testing::Test {};
TYPED_TEST_SUITE(TensorFuncTest, TensorTypes); TYPED_TEST_SUITE(TensorFuncTest, TensorTypes);
TYPED_TEST(TensorFuncTest, MaxCorrectness) { TYPED_TEST(TensorFuncTest, MaxCorrectness)
{
using T = TypeParam; using T = TypeParam;
size_t size = 4; size_t size = 4;
T* v1 = new T[size] {T(1), T(2), T(3), T(4)}; T* v1 = new T[size]{T(1), T(2), T(3), T(4)};
T* v2 = new T[size] {T(4), T(3), T(2), T(1)}; T* v2 = new T[size]{T(4), T(3), T(2), T(1)};
T* v3 = new T[size] {T(1), T(2), T(4), T(3)}; T* v3 = new T[size]{T(1), T(2), T(4), T(3)};
Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1); Tensor t1 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v1);
Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2); Tensor t2 = Tensor(MEMORY_CPU, getTensorType<T>(), {size}, v2);
...@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) { ...@@ -175,7 +183,8 @@ TYPED_TEST(TensorFuncTest, MaxCorrectness) {
delete[] v3; delete[] v3;
} }
TYPED_TEST(TensorFuncTest, MinCorrectness) { TYPED_TEST(TensorFuncTest, MinCorrectness)
{
using T = TypeParam; using T = TypeParam;
size_t size = 4; size_t size = 4;
...@@ -197,42 +206,45 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) { ...@@ -197,42 +206,45 @@ TYPED_TEST(TensorFuncTest, MinCorrectness) {
delete[] v3; delete[] v3;
} }
// Tensor::any<T>(x) must report true for a value stored in the tensor and false
// for a value that is absent.
TYPED_TEST(TensorFuncTest, AnyCorrectness)
{
    using T = TypeParam;

    // The vector owns the backing storage, so it is released on every exit path
    // (including an exception from the Tensor calls) without a manual delete[].
    std::vector<T> values{T(1), T(2), T(3), T(4)};
    Tensor         t = Tensor{MEMORY_CPU, getTensorType<T>(), {values.size()}, values.data()};

    EXPECT_TRUE(t.any<T>(T(1)));
    EXPECT_FALSE(t.any<T>(T(5)));
}
// Tensor::all<T>(x) must report true only when every element equals x.
TYPED_TEST(TensorFuncTest, AllCorrectness)
{
    using T = TypeParam;

    // Vectors own the backing storage; no manual delete[] is needed on any exit path.
    std::vector<T> uniform{T(1), T(1), T(1), T(1)};
    std::vector<T> mixed{T(1), T(1), T(1), T(2)};
    Tensor         t1 = Tensor{MEMORY_CPU, getTensorType<T>(), {uniform.size()}, uniform.data()};
    Tensor         t2 = Tensor{MEMORY_CPU, getTensorType<T>(), {mixed.size()}, mixed.data()};

    EXPECT_TRUE(t1.all<T>(T(1)));
    EXPECT_FALSE(t2.all<T>(T(2)));
    // A single differing element is enough to make all() false.
    EXPECT_FALSE(t2.all<T>(T(1)));
}
TYPED_TEST(TensorFuncTest, SliceCorrectness) { TYPED_TEST(TensorFuncTest, SliceCorrectness)
{
using T = TypeParam; using T = TypeParam;
constexpr int size = 12; constexpr int size = 12;
T* v = new T[size]; T* v = new T[size];
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
v[i] = i; v[i] = i;
} }
DataType dtype = getTensorType<T>(); DataType dtype = getTensorType<T>();
Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v); Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v);
Tensor t2 = t1.slice({2, 4}, 4); Tensor t2 = t1.slice({2, 4}, 4);
EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4])); EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4]));
// An overflowed tensor throws an exception. // An overflowed tensor throws an exception.
...@@ -241,4 +253,4 @@ TYPED_TEST(TensorFuncTest, SliceCorrectness) { ...@@ -241,4 +253,4 @@ TYPED_TEST(TensorFuncTest, SliceCorrectness) {
delete[] v; delete[] v;
} }
} // end of namespace } // end of namespace
...@@ -16,15 +16,15 @@ ...@@ -16,15 +16,15 @@
#pragma once #pragma once
#include <algorithm> // min, max #include <algorithm> // min, max
#include <assert.h> // assert #include <assert.h> // assert
#include <float.h> // FLT_MAX #include <float.h> // FLT_MAX
#include <iostream> // snprintf #include <iostream> // snprintf
#include <math.h> // expf, log #include <limits> // numeric_limits
#include <limits> // numeric_limits #include <math.h> // expf, log
#include <stdlib.h> // rand #include <stdlib.h> // rand
#include <string> // string #include <string> // string
#include <vector> // vector #include <vector> // vector
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
...@@ -36,32 +36,37 @@ ...@@ -36,32 +36,37 @@
using namespace turbomind; using namespace turbomind;
/**
 * Exception thrown when a unit-test expectation fails.
 *
 * A default-constructed error carries an empty message. Otherwise what() returns
 * "TEST FAIL [<name>] <msg>", where <name> identifies the failing test or
 * function and <msg> is optional detail.
 */
class TestFailureError: public std::exception {
private:
    std::string msg_;

public:
    explicit TestFailureError() = default;

    /**
     * @param name identifier placed between the square brackets
     * @param msg  optional detail appended after the bracketed name
     */
    explicit TestFailureError(std::string name, std::string msg = ""): msg_("TEST FAIL [" + name + "] " + msg) {}

    // noexcept replaces the deprecated throw() specification; override makes the
    // compiler verify that this really overrides std::exception::what().
    const char* what() const noexcept override
    {
        return msg_.c_str();
    }
};
// Fails the current test when `cond` evaluates to false: logs the enclosing
// function, the condition text and the source location, then throws
// TestFailureError carrying the function name.
#define EXPECT_TRUE(cond) \
    do { \
        if (!(cond)) { \
            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
            throw TestFailureError(__func__); \
        } \
    } while (false)

// Counterpart of EXPECT_TRUE: fails the current test when `cond` evaluates to
// true, with the same log line and the same exception.
#define EXPECT_FALSE(cond) \
    do { \
        if (cond) { \
            TM_LOG_ERROR("TEST FAIL [%s]: %s at %s:%d", __func__, #cond, __FILE__, __LINE__); \
            throw TestFailureError(__func__); \
        } \
    } while (false)
bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8) bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
{ {
...@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8) ...@@ -80,9 +85,11 @@ bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
} }
template<typename T> template<typename T>
bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float rtol) { bool checkResult(std::string name, T* out, T* ref, size_t size, float atol, float rtol)
size_t failures = 0; {
float relative_gap = 0.0f;; size_t failures = 0;
float relative_gap = 0.0f;
;
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
// The values for the output and the reference. // The values for the output and the reference.
...@@ -109,18 +116,21 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float ...@@ -109,18 +116,21 @@ bool checkResult(std::string name, T* out, T*ref, size_t size, float atol, float
// Allow not matched up to 1% elements. // Allow not matched up to 1% elements.
size_t tol_failures = (size_t)(0.01 * size); size_t tol_failures = (size_t)(0.01 * size);
TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)", TM_LOG_INFO("check...%6s : %-50s (failures: %.2f%% atol: %.2e rtol: %.2e rel_gap: %.2e%%)",
failures <= tol_failures ? "....OK" : "FAILED", name.c_str(), failures <= tol_failures ? "....OK" : "FAILED",
100. * failures / size, atol, rtol, 100. * relative_gap); name.c_str(),
100. * failures / size,
atol,
rtol,
100. * relative_gap);
return failures <= tol_failures; return failures <= tol_failures;
} }
template<typename T> template<typename T>
bool checkResult(std::string name, T* out, T* ref, size_t size, bool checkResult(std::string name, T* out, T* ref, size_t size, bool device_out = true, bool device_ref = false)
bool device_out = true, bool device_ref = false)
{ {
bool is_fp32 = sizeof(T) == 4; bool is_fp32 = sizeof(T) == 4;
float atol = is_fp32 ? 1e-4f : 1e-3f; float atol = is_fp32 ? 1e-4f : 1e-3f;
float rtol = is_fp32 ? 1e-2f : 1e-1f; float rtol = is_fp32 ? 1e-2f : 1e-1f;
T* h_out = nullptr; T* h_out = nullptr;
if (device_out) { if (device_out) {
...@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size, ...@@ -135,7 +145,7 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
ref = h_ref; ref = h_ref;
} }
bool is_ok = checkResult(name, out, ref, size, atol, rtol); bool is_ok = checkResult(name, out, ref, size, atol, rtol);
if (h_out != nullptr){ if (h_out != nullptr) {
delete[] h_out; delete[] h_out;
} }
if (h_ref != nullptr) { if (h_ref != nullptr) {
...@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size, ...@@ -145,7 +155,8 @@ bool checkResult(std::string name, T* out, T* ref, size_t size,
} }
template<typename T> template<typename T>
void initRandom(T* ptr, size_t size, float minval, float maxval) { void initRandom(T* ptr, size_t size, float minval, float maxval)
{
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); float val = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
val *= (maxval - minval); val *= (maxval - minval);
...@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) { ...@@ -153,7 +164,8 @@ void initRandom(T* ptr, size_t size, float minval, float maxval) {
} }
} }
void initRandomInt(int* ptr, size_t size, int minval, int maxval) { void initRandomInt(int* ptr, size_t size, int minval, int maxval)
{
assert(minval < maxval); assert(minval < maxval);
int mod = maxval - minval; int mod = maxval - minval;
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
...@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) { ...@@ -162,7 +174,8 @@ void initRandomInt(int* ptr, size_t size, int minval, int maxval) {
} }
template<typename T> template<typename T>
void tile(T* x, int m, int n) { void tile(T* x, int m, int n)
{
for (int i = 1; i < m; ++i) { for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) { for (int j = 0; j < n; ++j) {
x[i * n + j] = x[j]; x[i * n + j] = x[j];
...@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) { ...@@ -171,7 +184,8 @@ void tile(T* x, int m, int n) {
} }
template<typename T> template<typename T>
void tile(T* dst, T* src, int m, int n) { void tile(T* dst, T* src, int m, int n)
{
for (int i = 1; i < m; ++i) { for (int i = 1; i < m; ++i) {
for (int j = 0; j < n; ++j) { for (int j = 0; j < n; ++j) {
dst[i * n + j] = src[j]; dst[i * n + j] = src[j];
...@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) { ...@@ -182,11 +196,13 @@ void tile(T* dst, T* src, int m, int n) {
#define HALF_FLT_MAX 65504.0f #define HALF_FLT_MAX 65504.0f
template<typename T> template<typename T>
bool isHalf() { bool isHalf()
{
return std::is_same<T, half>::value; return std::is_same<T, half>::value;
} }
template<typename T> template<typename T>
static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr) { static inline void printMatrixWithLimit(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr); printMatrix(ptr, std::min(PRINT_LIMIT, m), std::min(PRINT_LIMIT, k), stride, is_device_ptr);
} }
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment