Commit 892cb743 authored by Chao Liu's avatar Chao Liu
Browse files

Merge remote-tracking branch 'origin/develop' into bf16_int8_ckprofiler

parents ba132d28 992f71e3
...@@ -45,7 +45,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpeciali ...@@ -45,7 +45,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpeciali
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding; static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
// clang-format off // clang-format off
#if 1 #if 0
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num| //######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch| //######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch|
...@@ -53,6 +53,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl ...@@ -53,6 +53,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// [256, 128, 4, 8], 1 stage, 2 occupancy // [256, 128, 4, 8], 1 stage, 2 occupancy
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>; < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>;
#elif 1
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
//######|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>;
#elif 0 #elif 0
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num| //######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num|
...@@ -82,14 +89,6 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl ...@@ -82,14 +89,6 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
// [ 64, 144, 4, 8], 2 stage, 2 occupancy // [ 64, 144, 4, 8], 2 stage, 2 occupancy
// 85 TFlops // 85 TFlops
// < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 144, 4, 8, 16, 16, 1, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 2>; // < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 144, 4, 8, 16, 16, 1, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 2>;
#elif 1
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Num|
//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Prefetch|
//######| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| |
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// [128, 144, 8, 8], 1 stage, 1 occupancy, bounded by LDS size
< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 144, 8, 8, 16, 16, 2, 9, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 8, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 1, 9, S<1, 1, 8, 1, 9, 2>, 8, 1>;
#endif #endif
// clang-format on // clang-format on
...@@ -170,9 +169,13 @@ int main(int argc, char* argv[]) ...@@ -170,9 +169,13 @@ int main(int argc, char* argv[])
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}); b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break; break;
default: case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}); b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
} }
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
......
add_subdirectory(host_tensor) add_subdirectory(host_tensor)
\ No newline at end of file
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
#include <iostream> #include <iostream>
#include "data_type.hpp" #include "data_type.hpp"
template <typename Range> template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{ {
......
...@@ -153,4 +153,15 @@ struct GeneratorTensor_Checkboard ...@@ -153,4 +153,15 @@ struct GeneratorTensor_Checkboard
} }
}; };
// Generator that fills a tensor with the coordinate value along dimension Dim:
// operator()(i0, i1, ..., iN) returns the Dim-th index as a float.
// Useful for deterministic, easy-to-verify initialization patterns in tests.
template <ck::index_t Dim>
struct GeneratorTensor_Sequential
{
    template <typename... Ts>
    float operator()(Ts... Xs) const
    {
        // Reject an out-of-range Dim at compile time instead of invoking
        // undefined behavior through an out-of-bounds std::array access.
        static_assert(Dim >= 0 && Dim < static_cast<ck::index_t>(sizeof...(Ts)),
                      "GeneratorTensor_Sequential: Dim must be less than the tensor rank");

        // NOTE(review): relies on <array> being available; confirm it is pulled
        // in by data_type.hpp or another transitive include in this header.
        std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
        return dims[Dim];
    }
};
#endif #endif
...@@ -5,4 +5,4 @@ ignore = pcre ...@@ -5,4 +5,4 @@ ignore = pcre
deps = deps =
-f dev-requirements.txt -f dev-requirements.txt
define = define =
BUILD_DEV=On BUILD_DEV=On
\ No newline at end of file
...@@ -13,58 +13,24 @@ include_directories(BEFORE ...@@ -13,58 +13,24 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/test/include ${PROJECT_SOURCE_DIR}/test/include
) )
# test_magic_number_division add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
set(MAGIC_NUMBER_DIVISISON_SOURCE magic_number_division/main.cpp) add_custom_target(tests)
add_executable(test_magic_number_division ${MAGIC_NUMBER_DIVISISON_SOURCE})
target_link_libraries(test_magic_number_division PRIVATE host_tensor) function(add_test_executeable TEST_NAME)
add_executable(${TEST_NAME} ${ARGN})
target_link_libraries(${TEST_NAME} PRIVATE host_tensor)
set(CONV2D_FWD_SOURCE conv2d_fwd/main.cpp) target_link_libraries(${TEST_NAME} PRIVATE device_gemm_instance)
target_link_libraries(${TEST_NAME} PRIVATE device_conv2d_fwd_instance)
add_executable(test_conv2d_fwd ${CONV2D_FWD_SOURCE}) add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> )
target_link_libraries(test_conv2d_fwd PRIVATE host_tensor) add_dependencies(tests ${TEST_NAME})
target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance) add_dependencies(check ${TEST_NAME})
endfunction(add_test_executeable TEST_NAME)
# test_split_k
set(SPLIT_K_SOURCE split_k/main.cpp)
add_executable(test_split_k ${SPLIT_K_SOURCE}) file(GLOB TESTS *.cpp)
target_link_libraries(test_split_k PRIVATE host_tensor)
target_link_libraries(test_split_k PRIVATE device_gemm_instance) foreach(TEST ${TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
# test_conv_util message("adding test ${BASE_NAME}")
set(CONV_UTIL_SOURCE conv_util/main.cpp) add_test_executeable(test_${BASE_NAME} ${TEST})
add_executable(test_conv_util ${CONV_UTIL_SOURCE}) endforeach(TEST ${TESTS})
target_link_libraries(test_conv_util PRIVATE host_tensor)
# test_reference_conv_fwd
set(REFERENCE_CONV_FWD_SOURCE reference_conv_fwd/main.cpp)
add_executable(test_reference_conv_fwd ${REFERENCE_CONV_FWD_SOURCE})
target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor)
# test_convnd_fwd_xdl
set(CONVND_FWD_XDL_SOURCE convnd_fwd_xdl/main.cpp)
add_executable(test_convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
target_link_libraries(test_convnd_fwd_xdl PRIVATE host_tensor)
# test_gemm_xdl_fp32
set(GEMM_XDL_FP32_SOURCE gemm_xdl/test_gemm_fp32.cpp)
add_executable(test_gemm_xdl_fp32 ${GEMM_XDL_FP32_SOURCE})
target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance)
# test_gemm_xdl_bf16
set(GEMM_XDL_BF16_SOURCE gemm_xdl/test_gemm_bf16.cpp)
add_executable(test_gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE})
target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance)
# test_gemm_xdl_int8
set(GEMM_XDL_INT8_SOURCE gemm_xdl/test_gemm_int8.cpp)
add_executable(test_gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE})
target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance)
# test space_filling_curve_
set(SPACE_FILLING_CURVE_SOURCE space_filling_curve/space_filling_curve.cpp)
add_executable(space_filling_curve ${SPACE_FILLING_CURVE_SOURCE})
target_link_libraries(space_filling_curve PRIVATE host_tensor)
...@@ -75,8 +75,12 @@ int main(int argc, char* argv[]) ...@@ -75,8 +75,12 @@ int main(int argc, char* argv[])
ck::index_t in_left_pad_w = 1; ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1; ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1; ck::index_t in_right_pad_w = 1;
if(argc == 1)
if(argc == 3) {
init_method = 1;
data_type = 0;
}
else if(argc == 3)
{ {
data_type = std::stoi(argv[1]); data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
...@@ -275,20 +279,22 @@ int main(int argc, char* argv[]) ...@@ -275,20 +279,22 @@ int main(int argc, char* argv[])
if(success) if(success)
{ {
std::cout << "test conv2d fwd : Pass" << std::endl; std::cout << "test conv2d fwd : Pass" << std::endl;
return 0;
} }
else else
{ {
std::cout << "test conv2d fwd: Fail " << std::endl; std::cout << "test conv2d fwd: Fail " << std::endl;
return -1;
} }
}; };
int res = -1;
if(data_type == 0) if(data_type == 0)
{ {
Run(float(), float(), float()); res = Run(float(), float(), float());
} }
else if(data_type == 1) else if(data_type == 1)
{ {
Run(ck::half_t(), ck::half_t(), ck::half_t()); res = Run(ck::half_t(), ck::half_t(), ck::half_t());
} }
else if(data_type == 2) else if(data_type == 2)
{ {
...@@ -296,12 +302,8 @@ int main(int argc, char* argv[]) ...@@ -296,12 +302,8 @@ int main(int argc, char* argv[])
} }
else if(data_type == 3) else if(data_type == 3)
{ {
Run(int8_t(), int8_t(), int8_t()); res = Run(int8_t(), int8_t(), int8_t());
}
else
{
return 1;
} }
return 0; return res;
} }
...@@ -111,7 +111,8 @@ bool TestGemm(DeviceGemmPtr_& gemmPtr) ...@@ -111,7 +111,8 @@ bool TestGemm(DeviceGemmPtr_& gemmPtr)
gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op); gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
// Assert // Assert
bool res = test_util::check_err(c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); bool res = test_util::check_err(
c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
......
...@@ -161,11 +161,11 @@ int main(int, char*[]) ...@@ -161,11 +161,11 @@ int main(int, char*[])
if(pass) if(pass)
{ {
std::cout << "test magic number division: Pass" << std::endl; std::cout << "test magic number division: Pass" << std::endl;
return 0;
} }
else else
{ {
std::cout << "test magic number division: Fail" << std::endl; std::cout << "test magic number division: Fail" << std::endl;
return -1;
} }
return 1;
} }
...@@ -57,32 +57,23 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result) ...@@ -57,32 +57,23 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
return true; return true;
} }
int main(int argc, char* argv[]) struct gemmArgs
{ {
if(argc != 9) int layout;
{ int M;
printf("arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); int N;
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); int K;
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); int StrideA;
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); int StrideB;
printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n"); int StrideC;
return 1; int KBatch;
} };
const int layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
const int M = std::stoi(argv[2]);
const int N = std::stoi(argv[3]);
const int K = std::stoi(argv[4]);
const int StrideA = std::stoi(argv[5]);
const int StrideB = std::stoi(argv[6]);
const int StrideC = std::stoi(argv[7]);
const int KBatch = std::stoi(argv[8]);
int test_gemm(const gemmArgs& args)
{
bool a_row_major, b_row_major, c_row_major; bool a_row_major, b_row_major, c_row_major;
switch(layout) switch(args.layout)
{ {
case GemmMatrixLayout::MK_KN_MN: case GemmMatrixLayout::MK_KN_MN:
a_row_major = true; a_row_major = true;
...@@ -121,10 +112,12 @@ int main(int argc, char* argv[]) ...@@ -121,10 +112,12 @@ int main(int argc, char* argv[])
} }
}; };
Tensor<float> a_m_k(f_host_tensor_descriptor(M, K, StrideA, a_row_major)); Tensor<float> a_m_k(f_host_tensor_descriptor(args.M, args.K, args.StrideA, a_row_major));
Tensor<float> b_k_n(f_host_tensor_descriptor(K, N, StrideB, b_row_major)); Tensor<float> b_k_n(f_host_tensor_descriptor(args.K, args.N, args.StrideB, b_row_major));
Tensor<float> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, c_row_major)); Tensor<float> c_m_n_host_result(
Tensor<float> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, c_row_major)); f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
Tensor<float> c_m_n_device_result(
f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
// init data // init data
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = std::thread::hardware_concurrency();
...@@ -151,17 +144,17 @@ int main(int argc, char* argv[]) ...@@ -151,17 +144,17 @@ int main(int argc, char* argv[])
// add device GEMM instances // add device GEMM instances
std::vector<DeviceGemmNoOpPtr> gemm_ptrs; std::vector<DeviceGemmNoOpPtr> gemm_ptrs;
if(layout == GemmMatrixLayout::MK_KN_MN) if(args.layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemm_ptrs); add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemm_ptrs);
} }
else if(layout == GemmMatrixLayout::MK_NK_MN) else if(args.layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemm_ptrs); add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemm_ptrs);
} }
else if(layout == GemmMatrixLayout::KM_KN_MN) else if(args.layout == GemmMatrixLayout::KM_KN_MN)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemm_ptrs); add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemm_ptrs);
...@@ -179,16 +172,16 @@ int main(int argc, char* argv[]) ...@@ -179,16 +172,16 @@ int main(int argc, char* argv[])
gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()), gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()),
static_cast<float*>(b_device_buf.GetDeviceBuffer()), static_cast<float*>(b_device_buf.GetDeviceBuffer()),
static_cast<float*>(c_device_buf.GetDeviceBuffer()), static_cast<float*>(c_device_buf.GetDeviceBuffer()),
M, args.M,
N, args.N,
K, args.K,
StrideA, args.StrideA,
StrideB, args.StrideB,
StrideC, args.StrideC,
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
KBatch); args.KBatch);
auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
...@@ -205,7 +198,7 @@ int main(int argc, char* argv[]) ...@@ -205,7 +198,7 @@ int main(int argc, char* argv[])
success = true; success = true;
} }
} }
auto error_code = 0;
if(success) if(success)
{ {
std::cout << "test split k : Pass" << std::endl; std::cout << "test split k : Pass" << std::endl;
...@@ -213,6 +206,48 @@ int main(int argc, char* argv[]) ...@@ -213,6 +206,48 @@ int main(int argc, char* argv[])
else else
{ {
std::cout << "test split k: Fail " << std::endl; std::cout << "test split k: Fail " << std::endl;
error_code = -1; // test needs to report failure
}
return error_code;
}
// Entry point for the split-K GEMM test.
// With no CLI arguments, runs a built-in default case; with 8 arguments,
// runs the single case described by them (layout, M, N, K, StrideA, StrideB,
// StrideC, KBatch). Returns 0 when every case passes, -1 otherwise.
int main(int argc, char* argv[])
{
    std::vector<gemmArgs> test_cases;

    if(argc == 1)
    {
        // Default smoke-test case used when no arguments are given.
        // TODO(JD): populate with more (and more meaningful) cases.
        test_cases = {{0, 3, 3, 3, 3, 3, 3, 1}};
        // Bug fix: this branch previously returned 0 immediately after
        // populating test_cases, so the default case was never executed and
        // the test always reported success without testing anything.
    }
    else if(argc == 9)
    {
        const int layout  = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
        const int M       = std::stoi(argv[2]);
        const int N       = std::stoi(argv[3]);
        const int K       = std::stoi(argv[4]);
        const int StrideA = std::stoi(argv[5]);
        const int StrideB = std::stoi(argv[6]);
        const int StrideC = std::stoi(argv[7]);
        const int KBatch  = std::stoi(argv[8]);

        test_cases = {{layout, M, N, K, StrideA, StrideB, StrideC, KBatch}};
    }
    else
    {
        printf("arg1: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n");
        return -1;
    }

    for(const auto& test_case : test_cases)
    {
        // test_gemm returns 0 on success and a negative error code on failure.
        // Bug fix: the previous check `if(!res) return -1;` was inverted — it
        // reported failure exactly when the test succeeded (res == 0).
        const auto res = test_gemm(test_case);
        if(res != 0)
            return -1;
    }
    return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment