Unverified commit 44a85546, authored by lvhan028, committed by GitHub

Fix build test error and move turbomind csrc test cases to `tests/csrc` (#188)

* fix build tests failure

* move src test cases to tests/csrc
parent 5545bbc5
@@ -285,7 +285,7 @@ add_subdirectory(src)
 add_subdirectory(examples)
 if(BUILD_TEST)
-    add_subdirectory(tests)
+    add_subdirectory(tests/csrc)
 endif()
 # # Mesaure the compile time
......
@@ -182,29 +182,29 @@ void invokeLogProbFromLogits(float* cum_log_probs,
         cum_log_probs, log_probs, input_lengths, max_input_length, batch_size, batch_first);
 }
-// template void invokeLogProbFromLogits(float* cum_log_probs,
-//                                       const float* logits,
-//                                       const int* input_ids,
-//                                       const int* input_lengths,
-//                                       const size_t max_input_length,
-//                                       const size_t batch_size,
-//                                       const size_t vocab_size,
-//                                       const size_t vocab_size_padded,
-//                                       void* workspace,
-//                                       const size_t workspace_size,
-//                                       cudaStream_t stream,
-//                                       const bool batch_first);
-// template void invokeLogProbFromLogits(float* cum_log_probs,
-//                                       const half* logits,
-//                                       const int* input_ids,
-//                                       const int* input_lengths,
-//                                       const size_t max_input_length,
-//                                       const size_t batch_size,
-//                                       const size_t vocab_size,
-//                                       const size_t vocab_size_padded,
-//                                       void* workspace,
-//                                       const size_t workspace_size,
-//                                       cudaStream_t stream,
-//                                       const bool batch_first);
+template void invokeLogProbFromLogits(float* cum_log_probs,
+                                      const float* logits,
+                                      const int* input_ids,
+                                      const int* input_lengths,
+                                      const size_t max_input_length,
+                                      const size_t batch_size,
+                                      const size_t vocab_size,
+                                      const size_t vocab_size_padded,
+                                      void* workspace,
+                                      const size_t workspace_size,
+                                      cudaStream_t stream,
+                                      const bool batch_first);
+template void invokeLogProbFromLogits(float* cum_log_probs,
+                                      const half* logits,
+                                      const int* input_ids,
+                                      const int* input_lengths,
+                                      const size_t max_input_length,
+                                      const size_t batch_size,
+                                      const size_t vocab_size,
+                                      const size_t vocab_size_padded,
+                                      void* workspace,
+                                      const size_t workspace_size,
+                                      cudaStream_t stream,
+                                      const bool batch_first);
 }  // end of namespace turbomind
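This hunk restores the explicit instantiations of invokeLogProbFromLogits for float and half, which is central to the build fix: a function template defined in a .cu file is invisible to other translation units, so every specialization the tests call must be instantiated next to the definition, or the test executable fails to link with undefined references. A minimal self-contained sketch of the idiom, using hypothetical names rather than the actual turbomind kernel:

    // scale.cu -- sketch: definition and explicit instantiations live together.
    #include <cstddef>
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    template<typename T>
    __global__ void scaleKernel(T* data, size_t n, float factor)
    {
        size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            data[i] = static_cast<T>(static_cast<float>(data[i]) * factor);
        }
    }

    // Host launcher; a header elsewhere would carry only the declaration:
    //   template<typename T> void invokeScale(T*, size_t, float, cudaStream_t);
    template<typename T>
    void invokeScale(T* data, size_t n, float factor, cudaStream_t stream)
    {
        const int block = 256;
        const int grid  = static_cast<int>((n + block - 1) / block);
        scaleKernel<<<grid, block, 0, stream>>>(data, n, factor);
    }

    // Without these two lines, a test calling invokeScale<float> or
    // invokeScale<half> from another .cu/.cc file fails at link time.
    template void invokeScale(float* data, size_t n, float factor, cudaStream_t stream);
    template void invokeScale(half* data, size_t n, float factor, cudaStream_t stream);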
@@ -15,6 +15,5 @@
 add_subdirectory(unittests)
 if(BUILD_PYT)
     add_subdirectory(gemm_dequantize)
-    add_subdirectory(moe)
     add_subdirectory(int8_gemm)
 endif()
@@ -42,7 +42,7 @@ target_compile_features(unittest PRIVATE cxx_std_14)
 target_link_libraries( # Libs for test_attention_kernels
     unittest PUBLIC
     -lcudart -lcurand
-    gen_relative_pos_bias gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
+    gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
 target_link_libraries( # Libs for test_logprob_kernels
     unittest PUBLIC
     -lcudart
@@ -50,7 +50,7 @@ target_link_libraries( # Libs for test_logprob_kernels
 target_link_libraries( # Libs for test_penalty_kernels
     unittest PUBLIC
     -lcublas -lcublasLt -lcudart
-    sampling_penalty_kernels beam_search_penalty_kernels memory_utils cuda_utils logger)
+    sampling_penalty_kernels memory_utils cuda_utils logger)
 target_link_libraries( # Libs for test_sampling_kernel
     unittest PUBLIC
     -lcudart
@@ -71,11 +71,6 @@ add_executable(test_gpt_kernels test_gpt_kernels.cu)
 target_link_libraries(test_gpt_kernels PUBLIC
     gpt_kernels memory_utils tensor cuda_utils logger)
-add_executable(test_activation test_activation.cu)
-target_link_libraries(test_activation PUBLIC
-    -lcublas -lcublasLt -lcudart
-    activation_kernels memory_utils cuda_utils logger)
 add_executable(test_context_attention_layer test_context_attention_layer.cu)
 target_link_libraries(test_context_attention_layer PUBLIC
     Llama -lcublas -lcublasLt -lcudart
......
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
-#include "src/turbomind/kernels/gen_relative_pos_bias.h"
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/kernels/unfused_attention_kernels.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/memory_utils.h"
 #include "src/turbomind/utils/nccl_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 #include <curand.h>
 #include <sstream>
@@ -333,121 +333,6 @@ public:
             EXPECT_TRUE(passed);
         }
     }
-    void runTestAlibiMaskedSoftmax(AttentionKernelTestParam param, bool is_benchmark = false)
-    {
-        DataType dtype = getTensorType<T>();
-        std::vector<size_t> qk_shape{param.batch_size, param.head_num, param.q_length, param.k_length};
-        bool use_fp32_qk = param.use_fp32_qk_buf && dtype != TYPE_FP32;
-        Tensor qk = createTensor(MEMORY_GPU, dtype, qk_shape);
-        Tensor qk_fp32 = use_fp32_qk ? createTensor(MEMORY_GPU, TYPE_FP32, qk_shape) : Tensor();
-        Tensor attn_mask = randomAttentionMask({param.batch_size, 1, param.q_length, param.k_length});
-        Tensor alibi_slopes = createTensor(MEMORY_GPU, dtype, {param.head_num});
-        // Input random initialization
-        if (param.use_fp32_qk_buf && dtype != TYPE_FP32) {
-            utils::normal<float>(curng, qk_fp32);
-        }
-        else {
-            utils::normal<T>(curng, qk);
-        }
-        invokeBuildAlibiSlopes(alibi_slopes.getPtr<T>(), param.head_num, stream);
-        sync_check_cuda_error();
-        Tensor h_alibi_slopes = createTensor(MEMORY_CPU, dtype, {param.head_num});
-        Tensor h_alibi_bias =
-            is_benchmark ? Tensor() : createTensor(MEMORY_CPU, dtype, {param.head_num, param.q_length, param.k_length});
-        // The nearest power of 2 equal to / smaller than num_heads followed by HF's implementation.
-        T* alibi_slope_ptr = h_alibi_slopes.getPtr<T>();
-        int num_heads_pow2 = utils::pow2_rounddown(param.head_num);
-        for (size_t h = 0; h < param.head_num; ++h) {
-            // The slope of linear bias of the attention head
-            if (h < num_heads_pow2) {
-                alibi_slope_ptr[h] = static_cast<T>(powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2) - 3.f)), h + 1));
-            }
-            else {
-                alibi_slope_ptr[h] = static_cast<T>(
-                    powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2 << 1) - 3.f)), (h - num_heads_pow2) * 2 + 1));
-            }
-            if (h_alibi_bias.size() > 0) {
-                T* alibi_bias_ptr = h_alibi_bias.getPtr<T>();
-                for (size_t qi = 0; qi < param.q_length; ++qi) {
-                    for (size_t ki = 0; ki < param.k_length; ++ki) {
-                        size_t hqk_idx = (h * param.q_length + qi) * param.k_length + ki;
-                        alibi_bias_ptr[hqk_idx] = ::math::mul(alibi_slope_ptr[h], T(0.0f + ki - qi));
-                    }
-                }
-            }
-        }
-        EXPECT_TRUE(
-            checkResult("CheckAlibiSlopes", alibi_slopes.getPtr<T>(), h_alibi_slopes.getPtr<T>(), param.head_num));
-        // Clone to host for reference computation if needed.
-        Tensor h_qk = is_benchmark ? Tensor() : toHost<T>(qk);
-        Tensor h_attn_mask = is_benchmark ? Tensor() : toHost<T>(attn_mask);
-        Tensor h_qk_fp32 = is_benchmark ? Tensor() : toHost<float>(qk_fp32);
-        T scale = static_cast<T>(1 / sqrtf(param.size_per_head * 1.0f));
-        if (param.use_fp32_qk_buf && dtype != TYPE_FP32) {
-            MaskedSoftmaxParam<T, float> softmax_param;
-            softmax_param.attention_score = qk.getPtr<T>();
-            softmax_param.qk = qk_fp32.getPtr<float>();
-            softmax_param.attention_mask = attn_mask.getPtr<T>();
-            softmax_param.linear_bias_slopes = alibi_slopes.getPtr<T>();
-            softmax_param.batch_size = param.batch_size;
-            softmax_param.num_heads = param.head_num;
-            softmax_param.q_length = param.q_length;
-            softmax_param.k_length = param.k_length;
-            softmax_param.qk_scale = scale;
-            invokeMaskedSoftmax(softmax_param, stream);
-            sync_check_cuda_error();
-        }
-        else {
-            MaskedSoftmaxParam<T, T> softmax_param;
-            softmax_param.attention_score = qk.getPtr<T>();
-            softmax_param.qk = qk.getPtr<T>();
-            softmax_param.attention_mask = attn_mask.getPtr<T>();
-            softmax_param.linear_bias_slopes = alibi_slopes.getPtr<T>();
-            softmax_param.batch_size = param.batch_size;
-            softmax_param.num_heads = param.head_num;
-            softmax_param.q_length = param.q_length;
-            softmax_param.k_length = param.k_length;
-            softmax_param.qk_scale = scale;
-            invokeMaskedSoftmax(softmax_param, stream);
-            sync_check_cuda_error();
-        }
-        if (!is_benchmark) {
-            if (use_fp32_qk) {
-                computeQkSoftmax(h_qk.getPtr<T>(),
-                                 h_qk_fp32.getPtr<T>(),
-                                 h_attn_mask.getPtr<T>(),
-                                 h_alibi_bias.getPtr<T>(),
-                                 param.batch_size,
-                                 param.head_num,
-                                 param.q_length,
-                                 param.k_length,
-                                 scale);
-            }
-            else {
-                computeQkSoftmax(h_qk.getPtr<T>(),
-                                 h_qk.getPtr<T>(),
-                                 h_attn_mask.getPtr<T>(),
-                                 h_alibi_bias.getPtr<T>(),
-                                 param.batch_size,
-                                 param.head_num,
-                                 param.q_length,
-                                 param.k_length,
-                                 scale);
-            }
-            bool passed = checkResult("AlibiMaskedSoftmax", qk.getPtr<T>(), h_qk.getPtr<T>(), qk.size());
-            EXPECT_TRUE(passed);
-        }
-    }
 };
 TYPED_TEST_SUITE(AttentionKernelTest, SupportTypes);
@@ -511,48 +396,4 @@ TYPED_TEST(AttentionKernelTest, Benchmark_MaskedSoftmax_LongSequence4096)
 {
     this->runTestMaskedSoftmax({8, 4096, 4096, 14, 128, false, 0, false, true}, true);
 }
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence1)
-{
-    this->runTestAlibiMaskedSoftmax({1, 12, 12, 4, 32, false, 0, false});
-}
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence2)
-{
-    // q_length is not multiple of 4.
-    this->runTestAlibiMaskedSoftmax({1, 11, 11, 4, 32, false, 0, false});
-}
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence_HasPrompt1)
-{
-    this->runTestAlibiMaskedSoftmax({1, 12, 20, 4, 32, false, 0, false});
-}
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence_HasPrompt2)
-{
-    // q_length is not multiple of 4.
-    this->runTestAlibiMaskedSoftmax({1, 11, 20, 4, 32, false, 0, false});
-}
-// Tests for long sentence generation. Assume the bloom 176B model with 8 TP.
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence1024)
-{
-    this->runTestAlibiMaskedSoftmax({8, 1024, 1024, 14, 128, false, 0, false, true}, true);
-}
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence2048)
-{
-    this->runTestAlibiMaskedSoftmax({8, 2048, 2048, 14, 128, false, 0, false, true}, true);
-}
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence3072)
-{
-    this->runTestAlibiMaskedSoftmax({8, 3072, 3072, 14, 128, false, 0, false, true}, true);
-}
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence4096)
-{
-    this->runTestAlibiMaskedSoftmax({4, 4096, 4096, 14, 128, false, 0, false, true}, true);
-}
 }  // end of namespace
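For reference, the deleted runTestAlibiMaskedSoftmax validated the GPU ALiBi path against a host-side oracle: one slope per head derived from the nearest power of two at or below head_num (HF's convention), and a per-logit bias of slope_h * (ki - qi) added before the masked softmax. Below is a standalone restatement of that host reference, a sketch that strips the test framework's tensor plumbing and inlines its utils::pow2_rounddown helper:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Host-side ALiBi slopes mirroring the deleted test oracle (HF-style).
    std::vector<float> alibiSlopes(std::size_t head_num)
    {
        // Nearest power of 2 at or below head_num (the deleted code called
        // utils::pow2_rounddown for this).
        std::size_t pow2 = 1;
        while (pow2 * 2 <= head_num) {
            pow2 *= 2;
        }
        std::vector<float> slopes(head_num);
        for (std::size_t h = 0; h < head_num; ++h) {
            if (h < pow2) {
                slopes[h] = powf(powf(0.5f, powf(0.5f, log2f(pow2) - 3.f)), h + 1);
            }
            else {
                // Heads beyond the power of 2 interleave slopes drawn from the
                // next power of 2, exactly as in the removed loop.
                slopes[h] = powf(powf(0.5f, powf(0.5f, log2f(pow2 << 1) - 3.f)),
                                 (h - pow2) * 2 + 1);
            }
        }
        return slopes;
    }

    // Bias added to the attention logit of query position qi and key position
    // ki before the masked softmax: negative left of the query, zero on the
    // diagonal. Matches the removed T(0.0f + ki - qi) expression.
    float alibiBias(float slope, std::size_t qi, std::size_t ki)
    {
        return slope * (0.0f + ki - qi);
    }

When head_num is itself a power of two, the slope formula collapses to 2^(-8(h+1)/head_num), the geometric sequence from the ALiBi paper.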
@@ -360,6 +360,7 @@ int main(int argc, const char* argv[])
         .out_accum    = accum_buf_ptr,
         .cu_seqlens_q = cu_seqlens_ptr,
         .cu_seqlens_k = nullptr,
+        .group_size   = 1,
         .layout_q     = layout_q,
         .layout_k     = layout_k,
         .layout_v     = layout_v,
......
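The new .group_size field above is presumably the grouped-query-attention group size, that is, how many query heads share one KV head, with 1 degenerating to ordinary multi-head attention; the diff itself only shows the field being initialized. Note also that C++20 designated initializers must appear in declaration order, which is why the field is spliced in exactly where it sits in the struct. A toy sketch of the head-mapping convention (hypothetical, not the turbomind struct):

    #include <cstdio>
    #include <initializer_list>

    // Under the usual GQA convention, group_size = query_head_num / kv_head_num
    // and consecutive query heads share a KV head.
    int kvHeadOf(int query_head, int group_size)
    {
        return query_head / group_size;
    }

    int main()
    {
        const int head_num = 8;
        for (int group_size : {1, 4}) {  // 1 reproduces plain multi-head attention
            std::printf("group_size=%d:", group_size);
            for (int h = 0; h < head_num; ++h) {
                std::printf(" q%d->kv%d", h, kvHeadOf(h, group_size));
            }
            std::printf("\n");
        }
        return 0;
    }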
@@ -14,7 +14,7 @@
 #include <iostream>
 #include <random>
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 using namespace turbomind;
......
@@ -12,7 +12,7 @@
 #include "src/turbomind/utils/logger.h"
 #include "src/turbomind/utils/memory_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 using namespace turbomind;
......
@@ -27,14 +27,11 @@
 #include <cublasLt.h>
 #include <cuda_runtime.h>
-#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
 #include "src/turbomind/kernels/penalty_types.h"
 #include "src/turbomind/kernels/sampling_penalty_kernels.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
-// #include "tests/unittests/unittest_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 using namespace turbomind;
......
@@ -19,7 +19,7 @@
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 using namespace turbomind;
......
@@ -17,8 +17,7 @@
 #include "src/turbomind/utils/memory_utils.h"
 #include "src/turbomind/utils/Tensor.h"
-// #include "tests/unittests/unittest_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 using namespace turbomind;
......
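The remaining hunks all make the same mechanical edit: with the tests relocated under tests/csrc, the gtest helper is included by its bare name. This works because the quoted form of #include is searched first relative to the directory of the including file (implementation-defined behavior, but true of GCC, Clang, and MSVC), so a header sitting next to the tests keeps resolving after the move, whereas the old project-root-relative path would require the repository root on the include path. Schematically (hypothetical layout):

    // tests/csrc/unittests/test_logprob_kernels.cu (hypothetical location)
    #include "gtest_utils.h"  // resolves to tests/csrc/unittests/gtest_utils.h
                              // wherever the tree is mounted; no -I flag needed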