Commit 19bb36d7 authored by wangshaojie6

merge dev branch

parents 3e7f7997 2e6eaf6e
@@ -10,7 +10,8 @@
 #include "host_tensor.hpp"
 #include "sequence.hpp"
-namespace {
+namespace test {
+namespace conv {
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -19,6 +20,9 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+using DeviceConvFwdNoOpPtr =
+    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
@@ -62,26 +66,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::
         1>; // CThreadTransferDstScalarPerVector
 // clang-format on
-} // namespace
-namespace test {
-namespace conv {
 template <ck::index_t NDim,
           typename InDataType  = float,
           typename WeiDataType = float,
           typename OutDataType = float>
-void RunConv(const ck::utils::conv::ConvParams& params,
-             const Tensor<InDataType>& input,
-             const Tensor<WeiDataType>& weights,
-             Tensor<OutDataType>& output)
+void get_test_convolution_fwd_instance(std::vector<DeviceConvFwdNoOpPtr>& instances)
 {
-    ck::utils::conv::run_convolution_forward<NDim,
-                                             InDataType,
-                                             WeiDataType,
-                                             OutDataType,
-                                             DeviceConvNDFwdInstance>(
-        params, input, weights, output);
+    using ConvInstanceT = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>;
+    instances.emplace_back(std::make_unique<ConvInstanceT>());
 }
 } // namespace conv
...
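For context, the replaced RunConv helper ran a single convolution directly, while the new get_test_convolution_fwd_instance only registers an instance and leaves execution to the caller. A minimal sketch of how a test might consume it; the surrounding test function and the non-null check are illustrative, not part of this commit:

// Sketch only: test_conv2d_instances is a hypothetical caller. Only
// get_test_convolution_fwd_instance and DeviceConvFwdNoOpPtr come from this diff;
// the test header above is assumed to provide the required includes.
bool test_conv2d_instances()
{
    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
    // NDim = 2 with the default float data types declared above.
    test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs);

    bool ok = true;
    for(const auto& conv_ptr : conv_ptrs)
    {
        // A real test would configure and launch each instance against device
        // buffers and compare with a host reference; here we only check that
        // registration succeeded.
        ok = ok && (conv_ptr != nullptr);
    }
    return ok;
}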
#ifndef GEMM_UTILS_HPP
#define GEMM_UTILS_HPP

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_gemm.hpp"
#include "tensor_layout.hpp"

namespace ck {
namespace gemm_util {

struct GemmParams
{
    GemmParams()
        : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0)
    {
    }

    ck::index_t M;
    ck::index_t N;
    ck::index_t K;
    ck::index_t StrideA;
    ck::index_t StrideB;
    ck::index_t StrideC;
    float alpha;
    float beta;
};

template <typename GemmInstance,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunHostGEMM(const Tensor<ADataType>& A,
                 const Tensor<BDataType>& B,
                 Tensor<CDataType>& C,
                 AElementwiseOperation a_element_op,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op)
{
    auto ref_gemm     = GemmInstance{};
    auto ref_invoker  = ref_gemm.MakeInvoker();
    auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);
    ref_invoker.Run(ref_argument);
}

template <typename DeviceGemmPtr_,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
                   const ck::gemm_util::GemmParams& params,
                   const Tensor<ADataType>& A,
                   const Tensor<BDataType>& B,
                   Tensor<CDataType>& C,
                   AElementwiseOperation a_element_op,
                   BElementwiseOperation b_element_op,
                   CElementwiseOperation c_element_op)
{
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
    a_m_k_device_buf.ToDevice(A.mData.data());
    b_k_n_device_buf.ToDevice(B.mData.data());

    auto invoker_ptr = gemmPtr->MakeInvokerPointer();
    auto argument_ptr =
        gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
                                     static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
                                     static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
                                     params.M,
                                     params.N,
                                     params.K,
                                     params.StrideA,
                                     params.StrideB,
                                     params.StrideC,
                                     a_element_op,
                                     b_element_op,
                                     c_element_op);

    if(!gemmPtr->IsSupportedArgument(argument_ptr.get()))
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    invoker_ptr->Run(argument_ptr.get());
    c_m_n_device_buf.FromDevice(C.mData.data());
}

template <typename DeviceGemmPtr_,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct TestGemm
{
    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({1, stride}));
                }
            };

        Tensor<ADataType> a_m_k(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BDataType> b_k_n(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<CDataType> c_m_n_host_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<CDataType> c_m_n_device_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

-       auto f_generate_tensor_value = [](auto desc, auto type) {
+       auto f_generate_tensor_value = [](auto& desc, auto type) {
            using dataType = decltype(type);
            if(std::is_same<dataType, int8_t>::value)
            {
                desc.GenerateTensorValue(GeneratorTensor_2<int8_t>{-5, 5});
            }
            else
            {
                desc.GenerateTensorValue(GeneratorTensor_3<dataType>{-0.5, 0.5});
            }
        };

        f_generate_tensor_value(a_m_k, ADataType{});
        f_generate_tensor_value(b_k_n, BDataType{});

        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
    }

    auto operator()(DeviceGemmPtr_& gemmPtr)
    {
        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
                  << ", CLayout = " << CLayout{}.name << std::endl;
        std::cout << gemmPtr->GetTypeString() << std::endl;

        // Arrange
        ck::gemm_util::GemmParams params;
        params.M       = 1024;
        params.N       = 1024;
        params.K       = 1024;
        params.StrideA = 1024;
        params.StrideB = 1024;
        params.StrideC = 1024;

        auto host_tensors = PrepareGemmTensor(params);

        const Tensor<ADataType>& a  = std::get<0>(host_tensors);
        const Tensor<BDataType>& b  = std::get<1>(host_tensors);
        Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
        Tensor<CDataType>& c_device = std::get<3>(host_tensors);

        auto a_element_op = AElementwiseOperation{};
        auto b_element_op = BElementwiseOperation{};
        auto c_element_op = CElementwiseOperation{};

        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                      BDataType,
                                                      CDataType,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;
        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
            a, b, c_host, a_element_op, b_element_op, c_element_op);

        // Act
        ck::gemm_util::RunDeviceGEMM(
            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);

        // Assert
        bool res = false;
        if(std::is_same<CDataType, float>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }
        else if(std::is_same<CDataType, ck::half_t>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }
        else if(std::is_same<CDataType, int8_t>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }

        return res;
    }
};

template <typename DeviceGemmPtr_,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct TestGemmBF16
{
    using BF16 = ck::bhalf_t;

    auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({1, stride}));
                }
            };

        // use fp32 host kernel to verify bf16 device kernel
        Tensor<BF16> a_m_k_bf16(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BF16> b_k_n_bf16(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<BF16> c_m_n_device_bf16(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<float> a_m_k_fp32(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<float> b_k_n_fp32(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<float> c_m_n_host_fp32(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<float> c_m_n_device_fp32(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

        a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
        b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
        bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
        bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);

        return std::make_tuple(a_m_k_bf16,
                               b_k_n_bf16,
                               c_m_n_device_bf16,
                               a_m_k_fp32,
                               b_k_n_fp32,
                               c_m_n_host_fp32,
                               c_m_n_device_fp32);
    }

    auto operator()(DeviceGemmPtr_& gemmPtr)
    {
        // Arrange
        ck::gemm_util::GemmParams params;
        params.M       = 1024;
        params.N       = 1024;
        params.K       = 1024;
        params.StrideA = 1024;
        params.StrideB = 1024;
        params.StrideC = 1024;

        auto host_tensors = PrepareGemmTensorBF16(params);

        const Tensor<BF16>& a_bf16   = std::get<0>(host_tensors);
        const Tensor<BF16>& b_bf16   = std::get<1>(host_tensors);
        Tensor<BF16>& c_device_bf16  = std::get<2>(host_tensors);
        Tensor<float>& a_fp32        = std::get<3>(host_tensors);
        Tensor<float>& b_fp32        = std::get<4>(host_tensors);
        Tensor<float>& c_host_fp32   = std::get<5>(host_tensors);
        Tensor<float>& c_device_fp32 = std::get<6>(host_tensors);

        auto a_element_op = AElementwiseOperation{};
        auto b_element_op = BElementwiseOperation{};
        auto c_element_op = CElementwiseOperation{};

        // use fp32 host kernel to verify bf16 device kernel
        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<float,
                                                      float,
                                                      float,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;
        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
            a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);

        // Act
        ck::gemm_util::RunDeviceGEMM(gemmPtr,
                                     params,
                                     a_bf16,
                                     b_bf16,
                                     c_device_bf16,
                                     a_element_op,
                                     b_element_op,
                                     c_element_op);
        bf16_to_f32_(c_device_bf16, c_device_fp32);

        // Assert
        bool res = ck::utils::check_err(
            c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
        std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;

        return res;
    };
};

} // namespace gemm_util
} // namespace ck
#endif
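As a usage note (not part of this diff): TestGemm and TestGemmBF16 are functors instantiated per layout/type combination and invoked with a device-op pointer, returning the pass/fail result. A hedged sketch under assumed names; DeviceGemmNoOpPtr and gemm_ptrs are hypothetical stand-ins for whatever instance factory the calling test uses, and the template argument order follows the TestGemm declaration above:

// Sketch only: placeholders as noted in the lead-in.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Row         = ck::tensor_layout::gemm::RowMajor;

template <typename DeviceGemmNoOpPtr>
bool run_fp32_gemm_tests(std::vector<DeviceGemmNoOpPtr>& gemm_ptrs)
{
    bool ok = true;
    for(auto& gemm_ptr : gemm_ptrs)
    {
        ok = ok && ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
                                           float,       // ADataType
                                           float,       // BDataType
                                           float,       // CDataType
                                           Row,         // ALayout
                                           Row,         // BLayout
                                           Row,         // CLayout
                                           PassThrough, // A elementwise op
                                           PassThrough, // B elementwise op
                                           PassThrough>{}(gemm_ptr);
    }
    return ok;
}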
@@ -37,19 +37,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
     return invariantDims;
 };
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
 constexpr int Rank = 4;
 constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
@@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method,
     bool result = true;
-    using HostInDataType  = typename type_mapping<InDataType>::OutType;
-    using HostOutDataType = typename type_mapping<OutDataType>::OutType;
-    using HostAccDataType = typename type_mapping<AccDataType>::OutType;
-    ReductionHost<HostInDataType,
-                  HostAccDataType,
-                  HostOutDataType,
+    ReductionHost<InDataType,
+                  AccDataType,
+                  OutDataType,
                   ReduceOpId,
                   Rank,
                   NumReduceDim,
@@ -240,11 +223,7 @@ bool test_reduce_no_index_impl(int init_method,
                   NeedIndices>
         hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-    hostReduce.Run(alpha,
-                   reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                   beta,
-                   reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                   nullptr);
+    hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
     const auto i_inLengths = to_int_vector(inLengths);
     const auto i_inStrides = to_int_vector(inStrides);
...
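Putting the two hunks above together: the host reference is now instantiated directly on the kernel's InDataType/AccDataType/OutDataType, so the type_mapping indirection and the reinterpret_casts to half_float::half are no longer needed. The assembled call on the new side looks like the sketch below; one template argument between NumReduceDim and NeedIndices sits outside the shown hunks and is left elided.

// Assembled from the new side of the hunks above; not additional code.
ReductionHost<InDataType,
              AccDataType,
              OutDataType,
              ReduceOpId,   // ReduceTensorOp::AVG in this test
              Rank,         // 4
              NumReduceDim,
              // (one more template argument, not shown in the diff, goes here)
              NeedIndices>
    hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);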
@@ -36,19 +36,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
     return invariantDims;
 };
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
 constexpr int Rank = 4;
 constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX;
@@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method,
     bool result = true;
-    using HostInDataType  = typename type_mapping<InDataType>::OutType;
-    using HostOutDataType = typename type_mapping<OutDataType>::OutType;
-    using HostAccDataType = typename type_mapping<AccDataType>::OutType;
-    ReductionHost<HostInDataType,
-                  HostAccDataType,
-                  HostOutDataType,
+    ReductionHost<InDataType,
+                  AccDataType,
+                  OutDataType,
                   ReduceOpId,
                   Rank,
                   NumReduceDim,
@@ -223,11 +206,8 @@ bool test_reduce_with_index_impl(int init_method,
                   NeedIndices>
         hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-    hostReduce.Run(alpha,
-                   reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                   beta,
-                   reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                   out_indices_ref.mData.data());
+    hostReduce.Run(
+        alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
     const auto i_inLengths = to_int_vector(inLengths);
     const auto i_inStrides = to_int_vector(inStrides);
...
 add_test_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
-target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor)
+target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_fwd_util)

-#include <algorithm>
 #include <cmath>
 #include <cstdlib>
 #include <half.hpp>
@@ -10,6 +9,7 @@
 #include "config.hpp"
 #include "conv_fwd_util.hpp"
 #include "element_wise_operation.hpp"
+#include "fill.hpp"
 #include "host_tensor.hpp"
 #include "reference_conv_fwd.hpp"
 #include "tensor_layout.hpp"
@@ -19,35 +19,6 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-template <typename T>
-struct FillMonotonicSeq
-{
-    T m_init_value{0};
-    T m_step{1};
-    template <typename ForwardIter>
-    void operator()(ForwardIter first, ForwardIter last) const
-    {
-        std::generate(first, last, [=, n = m_init_value]() mutable {
-            auto tmp = n;
-            n += m_step;
-            return tmp;
-        });
-    }
-};
-template <typename T>
-struct FillConstant
-{
-    T m_value{0};
-    template <typename ForwardIter>
-    void operator()(ForwardIter first, ForwardIter last) const
-    {
-        std::fill(first, last, m_value);
-    }
-};
 template <ck::index_t NDim,
           typename InDataType  = float,
           typename WeiDataType = float,
@@ -55,8 +26,8 @@ template <ck::index_t NDim,
           typename OutDataType = float,
           typename InLayout    = ck::tensor_layout::convolution::NHWC,
           typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
           typename OutLayout   = ck::tensor_layout::convolution::NHWK,
-          typename FillInputOp   = FillMonotonicSeq<InDataType>,
-          typename FillWeightsOp = FillConstant<WeiDataType>>
+          typename FillInputOp   = ck::utils::FillMonotonicSeq<InDataType>,
+          typename FillWeightsOp = ck::utils::FillConstant<WeiDataType>>
 Tensor<OutDataType>
 run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
                                   const FillInputOp& fill_input_op = FillInputOp{},
@@ -251,7 +222,7 @@ bool test_conv1d_nwc()
                                       ck::tensor_layout::convolution::NWC,
                                       ck::tensor_layout::convolution::KXC,
                                       ck::tensor_layout::convolution::NWK>(
-        params, FillMonotonicSeq<float>{0.f, 0.1f});
+        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
     ref_dims = std::vector<std::size_t>{2, 16, 16};
     ref_data = std::vector<float>{
@@ -349,7 +320,7 @@ bool test_conv3d_ncdhw()
                                       ck::tensor_layout::convolution::NCDHW,
                                       ck::tensor_layout::convolution::KCZYX,
                                       ck::tensor_layout::convolution::NKDHW>(
-        params, FillMonotonicSeq<float>{0.f, 0.1f});
+        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
     std::vector<std::size_t> ref_dims{1, 1, 4, 4, 4};
     std::vector<float> ref_data{
         407.7, 410.40002, 413.09998, 415.80002, 423.90002, 426.6, 429.30002, 432.,
@@ -383,7 +354,7 @@ bool test_conv3d_ncdhw()
                                       ck::tensor_layout::convolution::NCDHW,
                                       ck::tensor_layout::convolution::KCZYX,
                                       ck::tensor_layout::convolution::NKDHW>(
-        params, FillMonotonicSeq<float>{0.f, 0.1f});
+        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
     ref_dims = std::vector<std::size_t>{1, 2, 4, 4, 4};
     ref_data = std::vector<float>{
         2756.7002, 2764.7998, 2772.9001, 2781., 2853.9001, 2862., 2870.1, 2878.2002,
...
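The FillMonotonicSeq and FillConstant helpers deleted from this test now come from ck::utils via the new "fill.hpp" include. Based on the removed local definitions, a minimal equivalent looks like the sketch below; the actual ck::utils implementations may differ in detail. In the tests above, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f} fills the input tensor with 0.0, 0.1, 0.2, and so on, while FillConstant writes a single value everywhere.

// Sketch only, reconstructed from the removed local fillers; the real
// ck::utils versions live in fill.hpp and may not match exactly.
#include <algorithm>

namespace ck {
namespace utils {

template <typename T>
struct FillMonotonicSeq
{
    T m_init_value{0};
    T m_step{1};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        // Writes init, init + step, init + 2*step, ... into [first, last).
        std::generate(first, last, [=, n = m_init_value]() mutable {
            auto tmp = n;
            n += m_step;
            return tmp;
        });
    }
};

template <typename T>
struct FillConstant
{
    T m_value{0};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        // Every element in [first, last) gets the same value.
        std::fill(first, last, m_value);
    }
};

} // namespace utils
} // namespace ck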