Commit e4e99a49 authored by Po-Yen, Chen

Use new utilities to shorten codes

parent 7acbf104
@@ -3,25 +3,26 @@
#pragma once
#include "ck/ck.hpp"
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
namespace ck {
namespace profiler {

@@ -30,16 +31,16 @@ template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    std::cout << "[";
-   for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
+   for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
    {
        std::cout << "[";
-       for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
+       for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
        {
            std::cout << "[";
-           for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
+           for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
            {
                std::cout << "[";
-               for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
+               for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
                }

@@ -88,9 +89,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
    Tensor<OutDataType> output(out_g_n_k_wos_desc);
-   std::cout << "input: " << input.mDesc << std::endl;
-   std::cout << "weight: " << weight_host_result.mDesc << std::endl;
-   std::cout << "output: " << output.mDesc << std::endl;
+   std::cout << "input: " << input.GetDesc() << std::endl;
+   std::cout << "weight: " << weight_host_result.GetDesc() << std::endl;
+   std::cout << "output: " << output.GetDesc() << std::endl;
    switch(init_method)
    {

@@ -104,13 +105,12 @@ bool profile_conv_bwd_weight_impl(int do_verification,
        output.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
    }
-   DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
-   DeviceMem wei_device_buf(sizeof(WeiDataType) *
-                            weight_device_result.mDesc.GetElementSpaceSize());
-   DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
+   DeviceMem in_device_buf(input.GetMemorySize());
+   DeviceMem wei_device_buf(weight_device_result.GetMemorySize());
+   DeviceMem out_device_buf(output.GetMemorySize());
-   in_device_buf.ToDevice(input.mData.data());
-   out_device_buf.ToDevice(output.mData.data());
+   in_device_buf.ToDevice(input.data());
+   out_device_buf.ToDevice(output.data());
    if(do_verification)
    {

@@ -165,24 +165,23 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    for(auto& op_ptr : op_ptrs)
    {
-       auto argument_ptr =
-           op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                       static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                       static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                       conv_param.N_,
-                                       conv_param.K_,
-                                       conv_param.C_,
-                                       conv_param.input_spatial_lengths_,
-                                       conv_param.filter_spatial_lengths_,
-                                       conv_param.output_spatial_lengths_,
-                                       conv_param.conv_filter_strides_,
-                                       conv_param.conv_filter_dilations_,
-                                       conv_param.input_left_pads_,
-                                       conv_param.input_right_pads_,
-                                       in_element_op,
-                                       wei_element_op,
-                                       out_element_op,
-                                       split_k);
+       auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                       wei_device_buf.GetDeviceBuffer(),
+                                                       out_device_buf.GetDeviceBuffer(),
+                                                       conv_param.N_,
+                                                       conv_param.K_,
+                                                       conv_param.C_,
+                                                       conv_param.input_spatial_lengths_,
+                                                       conv_param.filter_spatial_lengths_,
+                                                       conv_param.output_spatial_lengths_,
+                                                       conv_param.conv_filter_strides_,
+                                                       conv_param.conv_filter_dilations_,
+                                                       conv_param.input_left_pads_,
+                                                       conv_param.input_right_pads_,
+                                                       in_element_op,
+                                                       wei_element_op,
+                                                       out_element_op,
+                                                       split_k);
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {

@@ -215,10 +214,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    if(do_verification)
    {
-       wei_device_buf.FromDevice(weight_device_result.mData.data());
-       bool pass =
-           ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
+       wei_device_buf.FromDevice(weight_device_result.data());
+       bool pass = ck::utils::check_err(weight_host_result, weight_device_result);
        if(!pass)
        {
......
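The changes in this file follow one pattern: input.mDesc becomes input.GetDesc(), input.mData.data() becomes input.data(), and sizeof(InDataType) * input.mDesc.GetElementSpaceSize() becomes input.GetMemorySize(). A minimal sketch of accessors with that behavior, assuming a Tensor that still stores mDesc/mData as the old call sites show (stand-in types, not CK's actual implementation):

#include <cstddef>
#include <vector>

struct DescSketch
{
    std::vector<std::size_t> lengths;

    // product of the lengths; the real descriptor also tracks strides
    std::size_t GetElementSpaceSize() const
    {
        std::size_t n = 1;
        for(std::size_t l : lengths)
            n *= l;
        return n;
    }
};

template <typename T>
struct TensorSketch
{
    DescSketch mDesc;
    std::vector<T> mData;

    const DescSketch& GetDesc() const { return mDesc; }
    T* data() { return mData.data(); }
    // what every "sizeof(T) * mDesc.GetElementSpaceSize()" call site now asks the tensor for
    std::size_t GetMemorySize() const { return sizeof(T) * mDesc.GetElementSpaceSize(); }
};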
@@ -4,15 +4,16 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
+#include "ck/library/utility/literals.hpp"
namespace ck {
namespace tensor_operation {

@@ -66,21 +67,21 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
    const ck::index_t Ho = output_spatial_lengths[0];
    const ck::index_t Wo = output_spatial_lengths[1];
+   using namespace ck::literals;
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
-           if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                        is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                        is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+           if constexpr(is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW> ||
+                        is_same_v<decltype(layout), ck::tensor_layout::convolution::KCYX> ||
+                        is_same_v<decltype(layout), ck::tensor_layout::convolution::NKHW>)
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                           std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+               return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
            }
-           else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
-                             is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
-                             is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
+           else if constexpr(is_same_v<decltype(layout), tensor_layout::convolution::NHWC> ||
+                             is_same_v<decltype(layout), tensor_layout::convolution::KYXC> ||
+                             is_same_v<decltype(layout), tensor_layout::convolution::NHWK>)
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                           std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+               return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
            }
        };

@@ -92,17 +93,16 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
                                                f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
    // bias: assume contiguous 1d vector
-   Tensor<OutDataType> bias_k(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+   Tensor<OutDataType> bias_k(HostTensorDescriptor({K}));
    // residual: assume same layout as output tensor
    Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-   std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-   std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-   std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-   std::cout << "bias_k: " << bias_k.mDesc << std::endl;
-   std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl;
+   std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
+   std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.GetDesc() << std::endl;
+   std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.GetDesc() << std::endl;
+   std::cout << "bias_k: " << bias_k.GetDesc() << std::endl;
+   std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.GetDesc() << std::endl;
    switch(init_method)
    {

@@ -157,17 +157,16 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
-   DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
-   DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize());
-   DeviceMem out_device_buf(sizeof(OutDataType) *
-                            out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize());
-   DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize());
-   DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpaceSize());
+   DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
+   DeviceMem wei_device_buf(wei_k_c_y_x.GetMemorySize());
+   DeviceMem out_device_buf(out_n_k_ho_wo_device_result.GetMemorySize());
+   DeviceMem bias_device_buf(bias_k.GetMemorySize());
+   DeviceMem resi_device_buf(resi_n_k_ho_wo.GetMemorySize());
-   in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-   wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-   bias_device_buf.ToDevice(bias_k.mData.data());
-   resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data());
+   in_device_buf.ToDevice(in_n_c_hi_wi.data());
+   wei_device_buf.ToDevice(wei_k_c_y_x.data());
+   bias_device_buf.ToDevice(bias_k.data());
+   resi_device_buf.ToDevice(resi_n_k_ho_wo.data());
    using DeviceConvFwdBiasReluAddPtr = ck::tensor_operation::device::
        DeviceConvFwdBiasActivationAddPtr<InElementOp, WeiElementOp, OutElementOp>;

@@ -196,25 +195,24 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
    // profile device Conv instances
    for(auto& op_ptr : op_ptrs)
    {
-       auto argument_ptr = op_ptr->MakeArgumentPointer(
-           static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
-           static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-           static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-           static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-           static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
-           N,
-           K,
-           C,
-           input_spatial_lengths,
-           filter_spatial_lengths,
-           output_spatial_lengths,
-           conv_filter_strides,
-           conv_filter_dilations,
-           input_left_pads,
-           input_right_pads,
-           in_element_op,
-           wei_element_op,
-           out_element_op);
+       auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                       wei_device_buf.GetDeviceBuffer(),
+                                                       out_device_buf.GetDeviceBuffer(),
+                                                       bias_device_buf.GetDeviceBuffer(),
+                                                       resi_device_buf.GetDeviceBuffer(),
+                                                       N,
+                                                       K,
+                                                       C,
+                                                       input_spatial_lengths,
+                                                       filter_spatial_lengths,
+                                                       output_spatial_lengths,
+                                                       conv_filter_strides,
+                                                       conv_filter_dilations,
+                                                       input_left_pads,
+                                                       input_right_pads,
+                                                       in_element_op,
+                                                       wei_element_op,
+                                                       out_element_op);
        auto invoker_ptr = op_ptr->MakeInvokerPointer();

@@ -225,7 +223,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
        float ave_time =
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-       std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+       std::size_t flop = 2_uz * N * K * Ho * Wo * C * Y * X;
        std::size_t num_btype =
            sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +

@@ -249,22 +247,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
    if(do_verification)
    {
-       out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-       ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
-                            out_n_k_ho_wo_host_result.mData);
+       out_device_buf.FromDevice(out_n_k_ho_wo_device_result.data());
+       ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
        if(do_log)
        {
-           LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
-               << std::endl;
-           LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
-               << std::endl;
+           LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi, ",") << std::endl;
+           LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x, ",") << std::endl;
            LogRangeAsType<float>(
-               std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
+               std::cout << "out_host : ", out_n_k_ho_wo_host_result, ",")
                << std::endl;
            LogRangeAsType<float>(
-               std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+               std::cout << "out_device: ", out_n_k_ho_wo_device_result, ",")
                << std::endl;
        }
    }
......
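The 1_uz seen in the new HostTensorDescriptor calls comes from the newly included ck/library/utility/literals.hpp; judging from its use, _uz is a user-defined literal that yields std::size_t, so a braced stride list such as {C_ * H * W, H * W, W, 1_uz} deduces a single element type. A standalone sketch of such a literal (assumed shape, not necessarily CK's exact definition):

#include <cstddef>

namespace sketch_literals {
// presumed shape of ck::literals::operator""_uz
constexpr std::size_t operator""_uz(unsigned long long v) { return static_cast<std::size_t>(v); }
} // namespace sketch_literals

int main()
{
    using namespace sketch_literals;
    std::size_t C = 3, H = 4, W = 5;
    // without _uz the list would mix std::size_t and int and fail to deduce a common type
    auto strides = {C * H * W, H * W, W, 1_uz};
    return strides.size() == 4 ? 0 : 1;
}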
@@ -4,15 +4,16 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
+#include "ck/library/utility/literals.hpp"
namespace ck {
namespace tensor_operation {

@@ -66,21 +67,21 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
    const ck::index_t Ho = output_spatial_lengths[0];
    const ck::index_t Wo = output_spatial_lengths[1];
+   using namespace ck::literals;
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
-           if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                        is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                        is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+           if constexpr(is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW> ||
+                        is_same_v<decltype(layout), ck::tensor_layout::convolution::KCYX> ||
+                        is_same_v<decltype(layout), ck::tensor_layout::convolution::NKHW>)
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                           std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+               return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
            }
-           else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
-                             is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
-                             is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
+           else if constexpr(is_same_v<decltype(layout), tensor_layout::convolution::NHWC> ||
+                             is_same_v<decltype(layout), tensor_layout::convolution::KYXC> ||
+                             is_same_v<decltype(layout), tensor_layout::convolution::NHWK>)
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                           std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+               return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
            }
        };

@@ -92,13 +93,12 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
                                                f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
    // bias: assume contiguous 1d vector
-   Tensor<OutDataType> bias_k(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+   Tensor<OutDataType> bias_k(HostTensorDescriptor({K}));
-   std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-   std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-   std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-   std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+   std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
+   std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.GetDesc() << std::endl;
+   std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.GetDesc() << std::endl;
+   std::cout << "bias_k: " << bias_k.GetDesc() << std::endl;
    switch(init_method)
    {

@@ -149,15 +149,14 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
-   DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
-   DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize());
-   DeviceMem out_device_buf(sizeof(OutDataType) *
-                            out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize());
-   DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize());
+   DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
+   DeviceMem wei_device_buf(wei_k_c_y_x.GetMemorySize());
+   DeviceMem out_device_buf(out_n_k_ho_wo_device_result.GetMemorySize());
+   DeviceMem bias_device_buf(bias_k.GetMemorySize());
-   in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-   wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-   bias_device_buf.ToDevice(bias_k.mData.data());
+   in_device_buf.ToDevice(in_n_c_hi_wi.data());
+   wei_device_buf.ToDevice(wei_k_c_y_x.data());
+   bias_device_buf.ToDevice(bias_k.data());
    using DeviceConvFwdBiasReluPtr = ck::tensor_operation::device::
        DeviceConvFwdBiasActivationPtr<InElementOp, WeiElementOp, OutElementOp>;

@@ -186,24 +185,23 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
    // profile device Conv instances
    for(auto& op_ptr : op_ptrs)
    {
-       auto argument_ptr = op_ptr->MakeArgumentPointer(
-           static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
-           static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-           static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-           static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-           N,
-           K,
-           C,
-           input_spatial_lengths,
-           filter_spatial_lengths,
-           output_spatial_lengths,
-           conv_filter_strides,
-           conv_filter_dilations,
-           input_left_pads,
-           input_right_pads,
-           in_element_op,
-           wei_element_op,
-           out_element_op);
+       auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                       wei_device_buf.GetDeviceBuffer(),
+                                                       out_device_buf.GetDeviceBuffer(),
+                                                       bias_device_buf.GetDeviceBuffer(),
+                                                       N,
+                                                       K,
+                                                       C,
+                                                       input_spatial_lengths,
+                                                       filter_spatial_lengths,
+                                                       output_spatial_lengths,
+                                                       conv_filter_strides,
+                                                       conv_filter_dilations,
+                                                       input_left_pads,
+                                                       input_right_pads,
+                                                       in_element_op,
+                                                       wei_element_op,
+                                                       out_element_op);
        auto invoker_ptr = op_ptr->MakeInvokerPointer();

@@ -214,7 +212,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
        float ave_time =
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-       std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+       std::size_t flop = 2_uz * N * K * Ho * Wo * C * Y * X;
        std::size_t num_btype =
            sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +

@@ -237,22 +235,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
    if(do_verification)
    {
-       out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-       ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
-                            out_n_k_ho_wo_host_result.mData);
+       out_device_buf.FromDevice(out_n_k_ho_wo_device_result.data());
+       ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
        if(do_log)
        {
-           LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
-               << std::endl;
-           LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
-               << std::endl;
+           LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi, ",") << std::endl;
+           LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x, ",") << std::endl;
            LogRangeAsType<float>(
-               std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
+               std::cout << "out_host : ", out_n_k_ho_wo_host_result, ",")
                << std::endl;
            LogRangeAsType<float>(
-               std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+               std::cout << "out_device: ", out_n_k_ho_wo_device_result, ",")
                << std::endl;
        }
    }
......
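Both bias_relu profilers above now pass whole tensors to ck::utils::check_err instead of their .mData members. A hypothetical range-level checker in the same spirit (illustrative only; CK's check_err has more tolerance handling and overloads):

#include <cmath>
#include <cstddef>
#include <vector>

template <typename Range>
bool check_err_sketch(const Range& out, const Range& ref, double tol = 1e-5)
{
    if(out.size() != ref.size())
        return false;
    for(std::size_t i = 0; i < out.size(); ++i)
        if(std::fabs(static_cast<double>(out[i]) - static_cast<double>(ref[i])) > tol)
            return false;
    return true;
}

int main()
{
    std::vector<float> device_result{1.0f, 2.0f};
    std::vector<float> host_result{1.0f, 2.0f};
    return check_err_sketch(device_result, host_result) ? 0 : 1;
}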
@@ -8,19 +8,19 @@
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace ck {
namespace profiler {

@@ -60,9 +60,9 @@ bool profile_conv_fwd_impl(int do_verification,
    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
-   std::cout << "input: " << input.mDesc << std::endl;
-   std::cout << "weight: " << weight.mDesc << std::endl;
-   std::cout << "output: " << host_output.mDesc << std::endl;
+   std::cout << "input: " << input.GetDesc() << std::endl;
+   std::cout << "weight: " << weight.GetDesc() << std::endl;
+   std::cout << "output: " << host_output.GetDesc() << std::endl;
    switch(init_method)
    {

@@ -76,12 +76,12 @@ bool profile_conv_fwd_impl(int do_verification,
        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
    }
-   DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
-   DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
-   DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
+   DeviceMem in_device_buf(input.GetMemorySize());
+   DeviceMem wei_device_buf(weight.GetMemorySize());
+   DeviceMem out_device_buf(device_output.GetMemorySize());
-   in_device_buf.ToDevice(input.mData.data());
-   wei_device_buf.ToDevice(weight.mData.data());
+   in_device_buf.ToDevice(input.data());
+   wei_device_buf.ToDevice(weight.data());
    // run reference op
    if(do_verification)

@@ -139,23 +139,22 @@ bool profile_conv_fwd_impl(int do_verification,
    for(auto& op_ptr : op_ptrs)
    {
-       auto argument_ptr =
-           op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                       static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                       static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                       conv_param.N_,
-                                       conv_param.K_,
-                                       conv_param.C_,
-                                       conv_param.input_spatial_lengths_,
-                                       conv_param.filter_spatial_lengths_,
-                                       conv_param.GetOutputSpatialLengths(),
-                                       conv_param.conv_filter_strides_,
-                                       conv_param.conv_filter_dilations_,
-                                       conv_param.input_left_pads_,
-                                       conv_param.input_right_pads_,
-                                       in_element_op,
-                                       wei_element_op,
-                                       out_element_op);
+       auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                       wei_device_buf.GetDeviceBuffer(),
+                                                       out_device_buf.GetDeviceBuffer(),
+                                                       conv_param.N_,
+                                                       conv_param.K_,
+                                                       conv_param.C_,
+                                                       conv_param.input_spatial_lengths_,
+                                                       conv_param.filter_spatial_lengths_,
+                                                       conv_param.GetOutputSpatialLengths(),
+                                                       conv_param.conv_filter_strides_,
+                                                       conv_param.conv_filter_dilations_,
+                                                       conv_param.input_left_pads_,
+                                                       conv_param.input_right_pads_,
+                                                       in_element_op,
+                                                       wei_element_op,
+                                                       out_element_op);
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {

@@ -189,17 +188,17 @@ bool profile_conv_fwd_impl(int do_verification,
    if(do_verification)
    {
-       out_device_buf.FromDevice(device_output.mData.data());
-       pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+       out_device_buf.FromDevice(device_output.data());
+       pass = pass & ck::utils::check_err(device_output, host_output);
        if(do_log)
        {
-           LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
-           LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
-           LogRangeAsType<float>(std::cout << "host_output : ", host_output.mData, ",")
+           LogRangeAsType<float>(std::cout << "input : ", input, ",") << std::endl;
+           LogRangeAsType<float>(std::cout << "weight: ", weight, ",") << std::endl;
+           LogRangeAsType<float>(std::cout << "host_output : ", host_output, ",")
                << std::endl;
-           LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
+           LogRangeAsType<float>(std::cout << "device_output: ", device_output, ",")
                << std::endl;
        }
    }
......
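In profile_conv_fwd_impl the static_cast<InDataType*>/static_cast<WeiDataType*>/static_cast<OutDataType*> wrappers around GetDeviceBuffer() are gone, which suggests the MakeArgumentPointer overload being called now accepts untyped device pointers and applies the element types internally. A hedged illustration of that call-site shape with made-up types (not the real CK interface):

#include <cstddef>

struct DeviceMemSketch
{
    void* GetDeviceBuffer() const { return nullptr; } // placeholder; the real class owns GPU memory
};

template <typename InDataType, typename WeiDataType, typename OutDataType>
struct ConvOpSketch
{
    struct Argument
    {
        const InDataType* in;
        const WeiDataType* wei;
        OutDataType* out;
    };

    // the typed casts happen once here instead of at every profiler call site
    Argument MakeArgument(const void* in, const void* wei, void* out) const
    {
        return {static_cast<const InDataType*>(in),
                static_cast<const WeiDataType*>(wei),
                static_cast<OutDataType*>(out)};
    }
};

int main()
{
    DeviceMemSketch in, wei, out;
    ConvOpSketch<float, float, float> op;
    auto arg = op.MakeArgument(in.GetDeviceBuffer(), wei.GetDeviceBuffer(), out.GetDeviceBuffer());
    return arg.out == nullptr ? 0 : 1;
}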
@@ -4,16 +4,17 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/ranges.hpp"
using F16 = ck::half_t;
using F32 = float;

@@ -241,16 +242,16 @@ template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    std::cout << "[";
-   for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
+   for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
    {
        std::cout << "[";
-       for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
+       for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
        {
            std::cout << "[";
-           for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
+           for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
            {
                std::cout << "[";
-               for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
+               for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
                }

@@ -294,16 +295,16 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};
-   std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
+   auto input_dims = ck::ranges::to<std::vector<std::size_t>>({N, C});
    input_dims.insert(
        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
-   std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
+   auto filter_dims = ck::ranges::to<std::vector<std::size_t>>({K, C});
    filter_dims.insert(std::end(filter_dims),
                       std::begin(filter_spatial_lengths),
                       std::end(filter_spatial_lengths));
-   std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
+   auto output_dims = ck::ranges::to<std::vector<std::size_t>>({N, K});
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

@@ -317,9 +318,9 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    Tensor<OutDataType> output(
        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
-   std::cout << "input: " << input_host_result.mDesc << std::endl;
-   std::cout << "weights: " << weights.mDesc << std::endl;
-   std::cout << "output: " << output.mDesc << std::endl;
+   std::cout << "input: " << input_host_result.GetDesc() << std::endl;
+   std::cout << "weights: " << weights.GetDesc() << std::endl;
+   std::cout << "output: " << output.GetDesc() << std::endl;
    switch(init_method)
    {

@@ -333,12 +334,12 @@ bool profile_convnd_bwd_data_impl(int do_verification,
        weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }
-   DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
-   DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
-   DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
+   DeviceMem in_device_buf(input_device_result.GetMemorySize());
+   DeviceMem wei_device_buf(weights.GetMemorySize());
+   DeviceMem out_device_buf(output.GetMemorySize());
-   out_device_buf.ToDevice(output.mData.data());
-   wei_device_buf.ToDevice(weights.mData.data());
+   out_device_buf.ToDevice(output.data());
+   wei_device_buf.ToDevice(weights.data());
    // reset input to zero
    in_device_buf.SetZero();

@@ -391,23 +392,22 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    bool success = true;
    for(auto& conv_ptr : conv_ptrs)
    {
-       auto argument_ptr = conv_ptr->MakeArgumentPointer(
-           static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-           static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-           static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-           N,
-           K,
-           C,
-           input_spatial_lengths,
-           filter_spatial_lengths,
-           output_spatial_lengths,
-           conv_filter_strides,
-           conv_filter_dilations,
-           input_left_pads,
-           input_right_pads,
-           in_element_op,
-           wei_element_op,
-           out_element_op);
+       auto argument_ptr = conv_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                         wei_device_buf.GetDeviceBuffer(),
+                                                         out_device_buf.GetDeviceBuffer(),
+                                                         N,
+                                                         K,
+                                                         C,
+                                                         input_spatial_lengths,
+                                                         filter_spatial_lengths,
+                                                         output_spatial_lengths,
+                                                         conv_filter_strides,
+                                                         conv_filter_dilations,
+                                                         input_left_pads,
+                                                         input_right_pads,
+                                                         in_element_op,
+                                                         wei_element_op,
+                                                         out_element_op);
        auto invoker_ptr = conv_ptr->MakeInvokerPointer();

@@ -440,7 +440,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
    if(do_verification)
    {
-       in_device_buf.FromDevice(input_device_result.mData.data());
+       in_device_buf.FromDevice(input_device_result.data());
        if(!check_out(input_host_result, input_device_result))
        {

@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
            std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
        }
-       success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
+       success = ck::utils::check_err(input_host_result, input_device_result);
        if(do_log)
        {
......
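ck::ranges::to<std::vector<std::size_t>>({N, C}) replaces the hand-written static_cast<std::size_t> pair when seeding the dimension vectors. A minimal stand-in with the same call shape (the real ck::ranges::to, from ck/library/utility/ranges.hpp, is presumably more general):

#include <cstddef>
#include <initializer_list>
#include <vector>

namespace ranges_sketch {
// copy-convert the elements of a braced list into the requested container type
template <typename Container, typename T>
Container to(std::initializer_list<T> values)
{
    return Container(values.begin(), values.end());
}
} // namespace ranges_sketch

int main()
{
    int N = 4, C = 16;
    auto input_dims = ranges_sketch::to<std::vector<std::size_t>>({N, C}); // {4, 16} as std::size_t

    // spatial lengths are appended afterwards, exactly as in the profiler code
    std::vector<std::size_t> input_spatial_lengths{28, 28};
    input_dims.insert(
        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));

    return input_dims.size() == 4 ? 0 : 1;
}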
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/utility/ranges.hpp"
using F16 = ck::half_t;
using F32 = float;

@@ -205,16 +206,16 @@ template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    std::cout << "[";
-   for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
+   for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
    {
        std::cout << "[";
-       for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
+       for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
        {
            std::cout << "[";
-           for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
+           for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
            {
                std::cout << "[";
-               for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
+               for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
                }

@@ -258,16 +259,16 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};
-   std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
+   auto input_dims = ck::ranges::to<std::vector<std::size_t>>({N, C});
    input_dims.insert(
        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
-   std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
+   auto filter_dims = ck::ranges::to<std::vector<std::size_t>>({K, C});
    filter_dims.insert(std::end(filter_dims),
                       std::begin(filter_spatial_lengths),
                       std::end(filter_spatial_lengths));
-   std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
+   auto output_dims = ck::ranges::to<std::vector<std::size_t>>({N, K});
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

@@ -280,9 +281,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
    Tensor<OutDataType> output(
        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
-   std::cout << "input: " << input.mDesc << std::endl;
-   std::cout << "weights: " << weights_host_result.mDesc << std::endl;
-   std::cout << "output: " << output.mDesc << std::endl;
+   std::cout << "input: " << input.GetDesc() << std::endl;
+   std::cout << "weights: " << weights_host_result.GetDesc() << std::endl;
+   std::cout << "output: " << output.GetDesc() << std::endl;
    switch(init_method)
    {

@@ -296,12 +297,12 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
        output.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }
-   DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-   DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace());
-   DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
+   DeviceMem in_device_buf(input.GetMemorySize());
+   DeviceMem wei_device_buf(weights_device_result.GetMemorySize());
+   DeviceMem out_device_buf(output.GetMemorySize());
-   in_device_buf.ToDevice(input.mData.data());
-   out_device_buf.ToDevice(output.mData.data());
+   in_device_buf.ToDevice(input.data());
+   out_device_buf.ToDevice(output.data());
    // reset input to zero
    wei_device_buf.SetZero();

@@ -359,24 +360,23 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
        //     wei_device_buf.SetZero();
        //}
-       auto argument_ptr = conv_ptr->MakeArgumentPointer(
-           static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-           static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-           static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-           N,
-           K,
-           C,
-           input_spatial_lengths,
-           filter_spatial_lengths,
-           output_spatial_lengths,
-           conv_filter_strides,
-           conv_filter_dilations,
-           input_left_pads,
-           input_right_pads,
-           in_element_op,
-           wei_element_op,
-           out_element_op,
-           split_k);
+       auto argument_ptr = conv_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                         wei_device_buf.GetDeviceBuffer(),
+                                                         out_device_buf.GetDeviceBuffer(),
+                                                         N,
+                                                         K,
+                                                         C,
+                                                         input_spatial_lengths,
+                                                         filter_spatial_lengths,
+                                                         output_spatial_lengths,
+                                                         conv_filter_strides,
+                                                         conv_filter_dilations,
+                                                         input_left_pads,
+                                                         input_right_pads,
+                                                         in_element_op,
+                                                         wei_element_op,
+                                                         out_element_op,
+                                                         split_k);
        if(!conv_ptr->IsSupportedArgument(argument_ptr.get()))
        {

@@ -390,7 +390,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
        std::string conv_name = conv_ptr->GetTypeString();
        float ave_time = 0;
-       if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
+       if constexpr(std::is_same_v<InDataType, ck::bhalf_t> && split_k > 1)
        {
            // alloc work space
            size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get());

@@ -431,9 +431,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
    if(do_verification)
    {
-       wei_device_buf.FromDevice(weights_device_result.mData.data());
-       success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
+       wei_device_buf.FromDevice(weights_device_result.data());
+       success = ck::utils::check_err(weights_host_result, weights_device_result);
        if(success == false)
        {
......
@@ -6,17 +6,19 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {

@@ -45,17 +47,17 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
                                        int StrideD1,
                                        int StrideE)
{
+   using namespace ck::literals;
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-           if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+           if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                           std::vector<std::size_t>({stride, 1}));
+               return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                           std::vector<std::size_t>({1, stride}));
+               return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -66,11 +68,11 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
-   std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-   std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-   std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
-   std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
-   std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
+   std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+   std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+   std::cout << "d0_m_n: " << d0_m_n.GetDesc() << std::endl;
+   std::cout << "d1_m_n: " << d1_m_n.GetDesc() << std::endl;
+   std::cout << "e_m_n: " << e_m_n_device_result.GetDesc() << std::endl;
    switch(init_method)
    {

@@ -121,8 +123,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    // run reference
    if(do_verification)
    {
-       Tensor<AccDataType> c_m_n(HostTensorDescriptor(
-           std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+       Tensor<AccDataType> c_m_n(HostTensorDescriptor({M, N}));
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,

@@ -149,16 +150,16 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
        }
    }
-   DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-   DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-   DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
-   DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
-   DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+   DeviceMem a_device_buf(a_m_k.GetMemorySize());
+   DeviceMem b_device_buf(b_k_n.GetMemorySize());
+   DeviceMem d0_m_n_device_buf(d0_m_n.GetMemorySize());
+   DeviceMem d1_m_n_device_buf(d1_m_n.GetMemorySize());
+   DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
-   a_device_buf.ToDevice(a_m_k.mData.data());
-   b_device_buf.ToDevice(b_k_n.mData.data());
-   d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
-   d1_m_n_device_buf.ToDevice(d1_m_n.mData.data());
+   a_device_buf.ToDevice(a_m_k.data());
+   b_device_buf.ToDevice(b_k_n.data());
+   d0_m_n_device_buf.ToDevice(d0_m_n.data());
+   d1_m_n_device_buf.ToDevice(d1_m_n.data());
    std::string best_op_name;
    float best_ave_time = 0;

@@ -170,22 +171,22 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    // profile device operation instances
    for(auto& op_ptr : op_ptrs)
    {
-       auto argument_ptr = op_ptr->MakeArgumentPointer(
-           a_device_buf.GetDeviceBuffer(),
-           b_device_buf.GetDeviceBuffer(),
-           std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
-                                      d1_m_n_device_buf.GetDeviceBuffer()},
+       auto argument_ptr =
+           op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                       b_device_buf.GetDeviceBuffer(),
+                                       ck::utils::to_array({d0_m_n_device_buf.GetDeviceBuffer(),
+                                                            d1_m_n_device_buf.GetDeviceBuffer()}),
            e_device_buf.GetDeviceBuffer(),
            M,
            N,
            K,
            StrideA,
            StrideB,
-           std::array<ck::index_t, 2>{StrideD0, StrideD1},
+           ck::utils::to_array({StrideD0, StrideD1}),
            StrideE,
            a_element_op,
            b_element_op,
            cde_element_op);
        auto invoker_ptr = op_ptr->MakeInvokerPointer();

@@ -199,7 +200,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
        float ave_time =
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-       std::size_t flop = std::size_t(2) * M * N * K;
+       std::size_t flop = 2_uz * M * N * K;
        std::size_t num_btype =
            sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;

@@ -221,10 +222,9 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
        if(do_verification)
        {
-           e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-           pass = pass &&
-                  ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+           e_device_buf.FromDevice(e_m_n_device_result.data());
+           pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
        }
    }
    else
......
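ck::utils::to_array({...}) (from the newly included ck/library/utility/array.hpp) replaces the explicit std::array<const void*, 2>{...} and std::array<ck::index_t, 2>{...} temporaries, deducing both element type and length from the braced list. A standalone sketch of a to_array-style helper (assumed behavior, not CK's exact code):

#include <array>
#include <cstddef>

namespace utils_sketch {
// deduce std::array<T, N> from a braced list so call sites drop the <type, count> spelling
template <typename T, std::size_t N>
constexpr std::array<T, N> to_array(const T (&values)[N])
{
    std::array<T, N> result{};
    for(std::size_t i = 0; i < N; ++i)
        result[i] = values[i];
    return result;
}
} // namespace utils_sketch

int main()
{
    int StrideD0 = 1024, StrideD1 = 2048;
    auto stride_ds = utils_sketch::to_array({StrideD0, StrideD1}); // std::array<int, 2>
    return stride_ds[1] == 2048 ? 0 : 1;
}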
@@ -4,17 +4,18 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
namespace ck {
namespace tensor_operation {

@@ -74,22 +75,21 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                                       int StrideC,
                                       int StrideD0)
{
+   using namespace ck::literals;
    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-       return HostTensorDescriptor(std::vector<std::size_t>({len}),
-                                   std::vector<std::size_t>({stride}));
+       return HostTensorDescriptor({len}, {stride});
    };
    auto f_host_tensor_descriptor2d =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-           if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+           if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                           std::vector<std::size_t>({stride, 1}));
+               return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-               return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                           std::vector<std::size_t>({1, stride}));
+               return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

@@ -99,22 +99,18 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
    Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-   Tensor<ReduceDataType> reduce0_m_host_result(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-   Tensor<ReduceDataType> reduce1_m_host_result(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+   Tensor<ReduceDataType> reduce0_m_host_result(HostTensorDescriptor({M}));
+   Tensor<ReduceDataType> reduce1_m_host_result(HostTensorDescriptor({M}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-   Tensor<ReduceDataType> reduce0_m_device_result(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-   Tensor<ReduceDataType> reduce1_m_device_result(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+   Tensor<ReduceDataType> reduce0_m_device_result(HostTensorDescriptor({M}));
+   Tensor<ReduceDataType> reduce1_m_device_result(HostTensorDescriptor({M}));
-   std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-   std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-   std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-   std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
-   std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
+   std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+   std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+   std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
+   std::cout << "reduce0_m: " << reduce0_m_host_result.GetDesc() << std::endl;
+   std::cout << "reduce1_m: " << reduce1_m_host_result.GetDesc() << std::endl;
    std::size_t num_thread = 1;
    switch(init_method)

@@ -217,23 +213,21 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
        }
    }
-   DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-   DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-   DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
-   DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
-   DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
-   DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-                                reduce0_m_device_result.mDesc.GetElementSpaceSize());
-   DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
-                                reduce1_m_device_result.mDesc.GetElementSpaceSize());
+   DeviceMem a_device_buf(a_m_k.GetMemorySize());
+   DeviceMem b_device_buf(b_k_n.GetMemorySize());
+   DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
+   DeviceMem bias_device_buf(bias_n.GetMemorySize());
+   DeviceMem d0_device_buf(d0_m_n.GetMemorySize());
+   DeviceMem reduce0_device_buf(reduce0_m_device_result.GetMemorySize());
+   DeviceMem reduce1_device_buf(reduce1_m_device_result.GetMemorySize());
    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
                                      reduce1_device_buf.GetDeviceBuffer()};
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.data());
bias_device_buf.ToDevice(bias_n.mData.data()); bias_device_buf.ToDevice(bias_n.data());
d0_device_buf.ToDevice(d0_m_n.mData.data()); d0_device_buf.ToDevice(d0_m_n.data());
// add device GEMM instances // add device GEMM instances
std::vector<ck::tensor_operation::device::instance::DeviceGemmBiasAddReduceNoOpPtr> gemm_ptrs; std::vector<ck::tensor_operation::device::instance::DeviceGemmBiasAddReduceNoOpPtr> gemm_ptrs;
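The `DeviceMem` constructions above no longer repeat `sizeof(DataType) * mDesc.GetElementSpaceSize()`; that arithmetic is presumably what the new `Tensor::GetMemorySize()` computes, with `data()` and `GetDesc()` as thin accessors replacing the direct `mData`/`mDesc` member accesses. A minimal sketch of the assumed helpers:

// Assumed shape of the new Tensor helpers; the real class in
// ck/library/utility/host_tensor.hpp may differ.
#include <cstddef>
#include <vector>

struct DescriptorSketch
{
    std::size_t GetElementSpaceSize() const { return mElementSpaceSize; }
    std::size_t mElementSpaceSize = 0;
};

template <typename T>
struct TensorSketch
{
    // bytes required by a device buffer backing this tensor,
    // i.e. the old sizeof(T) * mDesc.GetElementSpaceSize() expression
    std::size_t GetMemorySize() const { return sizeof(T) * mDesc.GetElementSpaceSize(); }

    // shorthand for the old mData.data() / mDesc accesses
    T* data() { return mData.data(); }
    const T* data() const { return mData.data(); }
    const DescriptorSketch& GetDesc() const { return mDesc; }

    DescriptorSketch mDesc;
    std::vector<T> mData;
};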
...@@ -319,7 +313,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -319,7 +313,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N; std::size_t flop = 2_uz * M * N * K + 2_uz * M * N;
std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N + sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
...@@ -343,33 +337,29 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -343,33 +337,29 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.data());
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); reduce0_device_buf.FromDevice(reduce0_m_device_result.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData); ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData); ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result.mData, ",") LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d0_host: ", reduce0_m_host_result, ",")
std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d0_device: ", reduce0_m_device_result, ",")
std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d1_host: ", reduce1_m_host_result, ",")
std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d1_device: ", reduce1_m_device_result, ",")
std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
<< std::endl; << std::endl;
} }
} }
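`ck::utils::check_err` and `LogRangeAsType` are now called with `Tensor` objects rather than their `.mData` vectors, which only requires those utilities to accept any iterable range. A hedged sketch of what such a range-based comparison could look like (the actual utility also takes a message and tolerances and handles integer and floating-point types separately):

// Simplified stand-in for a range-based check_err; not the CK implementation.
#include <cmath>
#include <cstddef>
#include <iostream>

template <typename ResultRange, typename ReferenceRange>
bool check_err_sketch(const ResultRange& result,
                      const ReferenceRange& reference,
                      double rtol = 1e-5,
                      double atol = 1e-8)
{
    bool pass = true;
    std::size_t index = 0;
    auto res_it = result.begin();
    auto ref_it = reference.begin();

    for(; res_it != result.end() && ref_it != reference.end(); ++res_it, ++ref_it, ++index)
    {
        const double res = static_cast<double>(*res_it);
        const double ref = static_cast<double>(*ref_it);

        if(std::abs(res - ref) > atol + rtol * std::abs(ref))
        {
            std::cout << "mismatch at index " << index << ": " << res << " vs " << ref
                      << std::endl;
            pass = false;
        }
    }

    return pass;
}

// Passing a Tensor directly then works as long as Tensor exposes begin()/end(),
// which is what removes the .mData spelling at every call site above.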
......
...@@ -6,17 +6,19 @@ ...@@ -6,17 +6,19 @@
#include <iomanip> #include <iomanip>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -44,17 +46,17 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -44,17 +46,17 @@ bool profile_gemm_bilinear_impl(int do_verification,
float alpha, float alpha,
float beta) float beta)
{ {
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -64,10 +66,10 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -64,10 +66,10 @@ bool profile_gemm_bilinear_impl(int do_verification,
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; std::cout << "d_m_n: " << d_m_n.GetDesc() << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl; std::cout << "e_m_n: " << e_m_n_device_result.GetDesc() << std::endl;
switch(init_method) switch(init_method)
{ {
...@@ -116,8 +118,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -116,8 +118,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
// run reference // run reference
if(do_verification) if(do_verification)
{ {
Tensor<AccDataType> c_m_n(HostTensorDescriptor( Tensor<AccDataType> c_m_n(HostTensorDescriptor({M, N}));
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
...@@ -144,14 +145,14 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -144,14 +145,14 @@ bool profile_gemm_bilinear_impl(int do_verification,
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize()); DeviceMem d_m_n_device_buf(d_m_n.GetMemorySize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.data());
d_m_n_device_buf.ToDevice(d_m_n.mData.data()); d_m_n_device_buf.ToDevice(d_m_n.data());
std::string best_op_name; std::string best_op_name;
float best_ave_time = 0; float best_ave_time = 0;
...@@ -163,21 +164,21 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -163,21 +164,21 @@ bool profile_gemm_bilinear_impl(int do_verification,
// profile device operation instances // profile device operation instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = op_ptr->MakeArgumentPointer( auto argument_ptr =
a_device_buf.GetDeviceBuffer(), op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(), b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_m_n_device_buf.GetDeviceBuffer()}, ck::utils::to_array({d_m_n_device_buf.GetDeviceBuffer()}),
e_device_buf.GetDeviceBuffer(), e_device_buf.GetDeviceBuffer(),
M, M,
N, N,
K, K,
StrideA, StrideA,
StrideB, StrideB,
std::array<ck::index_t, 1>{StrideD}, ck::utils::to_array({StrideD}),
StrideE, StrideE,
a_element_op, a_element_op,
b_element_op, b_element_op,
cde_element_op); cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
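`ck::utils::to_array({...})` replaces the explicitly typed `std::array<const void*, 1>{...}` and `std::array<ck::index_t, 1>{...}` temporaries in the argument list above. Assuming it behaves like C++20 `std::to_array`, a sketch of such a helper:

// Hypothetical helper in the spirit of std::to_array; the real
// ck::utils::to_array may be declared differently.
#include <array>
#include <cstddef>

template <typename T, std::size_t N>
constexpr std::array<T, N> to_array_sketch(const T (&values)[N])
{
    std::array<T, N> result{};
    for(std::size_t i = 0; i < N; ++i)
    {
        result[i] = values[i];
    }
    return result;
}

// to_array_sketch({StrideD}) deduces both the element type and the size 1,
// so the call site no longer spells out std::array<ck::index_t, 1>.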
...@@ -191,7 +192,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -191,7 +192,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
float ave_time = float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype = std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
...@@ -213,10 +214,9 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -213,10 +214,9 @@ bool profile_gemm_bilinear_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.data());
pass = pass && pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
} }
} }
else else
......
...@@ -8,17 +8,18 @@ ...@@ -8,17 +8,18 @@
#include <typeinfo> #include <typeinfo>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp" #include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -43,17 +44,17 @@ int profile_gemm_impl(int do_verification, ...@@ -43,17 +44,17 @@ int profile_gemm_impl(int do_verification,
{ {
bool pass = true; bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -62,9 +63,9 @@ int profile_gemm_impl(int do_verification, ...@@ -62,9 +63,9 @@ int profile_gemm_impl(int do_verification,
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_device_result.GetDesc() << std::endl;
switch(init_method) switch(init_method)
{ {
...@@ -86,12 +87,12 @@ int profile_gemm_impl(int do_verification, ...@@ -86,12 +87,12 @@ int profile_gemm_impl(int do_verification,
const auto b_element_op = BElementOp{}; const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{}; const auto c_element_op = CElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.data());
using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout, using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
BLayout, BLayout,
...@@ -137,19 +138,18 @@ int profile_gemm_impl(int do_verification, ...@@ -137,19 +138,18 @@ int profile_gemm_impl(int do_verification,
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()), b_device_buf.GetDeviceBuffer(),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()), c_device_buf.GetDeviceBuffer(),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()), M,
M, N,
N, K,
K, StrideA,
StrideA, StrideB,
StrideB, StrideC,
StrideC, a_element_op,
a_element_op, b_element_op,
b_element_op, c_element_op);
c_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
...@@ -163,7 +163,7 @@ int profile_gemm_impl(int do_verification, ...@@ -163,7 +163,7 @@ int profile_gemm_impl(int do_verification,
float avg_time = float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype = std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
...@@ -185,18 +185,17 @@ int profile_gemm_impl(int do_verification, ...@@ -185,18 +185,17 @@ int profile_gemm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.data());
pass = pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",") LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl; << std::endl;
} }
} }
......
...@@ -4,17 +4,18 @@ ...@@ -4,17 +4,18 @@
#pragma once #pragma once
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -73,17 +74,17 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -73,17 +74,17 @@ bool profile_gemm_reduce_impl(int do_verification,
{ {
bool pass = true; bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -91,22 +92,18 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -91,22 +92,18 @@ bool profile_gemm_reduce_impl(int do_verification,
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_host_result( Tensor<ReduceDataType> reduce0_m_host_result(HostTensorDescriptor({M}));
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_host_result(HostTensorDescriptor({M}));
Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_device_result( Tensor<ReduceDataType> reduce0_m_device_result(HostTensorDescriptor({M}));
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_device_result(HostTensorDescriptor({M}));
Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl; std::cout << "reduce0_m: " << reduce0_m_host_result.GetDesc() << std::endl;
std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl; std::cout << "reduce1_m: " << reduce1_m_host_result.GetDesc() << std::endl;
std::size_t num_thread = 1; std::size_t num_thread = 1;
switch(init_method) switch(init_method)
...@@ -189,19 +186,17 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -189,19 +186,17 @@ bool profile_gemm_reduce_impl(int do_verification,
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
DeviceMem reduce0_device_buf(sizeof(ReduceDataType) * DeviceMem reduce0_device_buf(reduce0_m_device_result.GetMemorySize());
reduce0_m_device_result.mDesc.GetElementSpaceSize()); DeviceMem reduce1_device_buf(reduce1_m_device_result.GetMemorySize());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
reduce1_m_device_result.mDesc.GetElementSpaceSize());
std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(), std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
reduce1_device_buf.GetDeviceBuffer()}; reduce1_device_buf.GetDeviceBuffer()};
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.data());
// add device GEMM instances // add device GEMM instances
std::vector<ck::tensor_operation::device::instance::DeviceGemmReduceNoOpPtr> gemm_ptrs; std::vector<ck::tensor_operation::device::instance::DeviceGemmReduceNoOpPtr> gemm_ptrs;
...@@ -287,7 +282,7 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -287,7 +282,7 @@ bool profile_gemm_reduce_impl(int do_verification,
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(CDataType) * N; sizeof(CDataType) * M * N + sizeof(CDataType) * N;
...@@ -309,33 +304,29 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -309,33 +304,29 @@ bool profile_gemm_reduce_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.data());
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); reduce0_device_buf.FromDevice(reduce0_m_device_result.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData); ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData); ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result.mData, ",") LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d0_host: ", reduce0_m_host_result, ",")
std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d0_device: ", reduce0_m_device_result, ",")
std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d1_host: ", reduce1_m_host_result, ",")
std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "d1_device: ", reduce1_m_device_result, ",")
std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
<< std::endl; << std::endl;
} }
} }
......
...@@ -8,17 +8,18 @@ ...@@ -8,17 +8,18 @@
#include <typeinfo> #include <typeinfo>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -44,17 +45,17 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -44,17 +45,17 @@ bool profile_gemm_splitk_impl(int do_verification,
{ {
bool pass = true; bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -63,9 +64,9 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -63,9 +64,9 @@ bool profile_gemm_splitk_impl(int do_verification,
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_device_result.GetDesc() << std::endl;
switch(init_method) switch(init_method)
{ {
...@@ -87,13 +88,13 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -87,13 +88,13 @@ bool profile_gemm_splitk_impl(int do_verification,
const auto b_element_op = BElementOp{}; const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{}; const auto c_element_op = CElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.data());
c_device_buf.ToDevice(c_m_n_device_result.mData.data()); c_device_buf.ToDevice(c_m_n_device_result.data());
using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<ALayout, using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<ALayout,
BLayout, BLayout,
...@@ -139,20 +140,19 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -139,20 +140,19 @@ bool profile_gemm_splitk_impl(int do_verification,
// profile device GEMM instances // profile device GEMM instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()), b_device_buf.GetDeviceBuffer(),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()), c_device_buf.GetDeviceBuffer(),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()), M,
M, N,
N, K,
K, StrideA,
StrideA, StrideB,
StrideB, StrideC,
StrideC, a_element_op,
a_element_op, b_element_op,
b_element_op, c_element_op,
c_element_op, KBatch);
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
...@@ -166,7 +166,7 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -166,7 +166,7 @@ bool profile_gemm_splitk_impl(int do_verification,
float ave_time = float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype = std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
...@@ -188,18 +188,17 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -188,18 +188,17 @@ bool profile_gemm_splitk_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.data());
pass = pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",") LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl; << std::endl;
} }
} }
......
...@@ -8,19 +8,21 @@ ...@@ -8,19 +8,21 @@
#include <typeinfo> #include <typeinfo>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -66,7 +68,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -66,7 +68,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{}; std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
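The copy lambda now takes its source by const reference and defers to `ck::ranges::copy`, which is assumed here to be a thin range-based wrapper over `std::copy` (similar in spirit to `std::ranges::copy`). A sketch of that assumption:

// Assumed behaviour of a range-based copy helper; the real ck::ranges::copy
// may differ in signature or return type.
#include <algorithm>

template <typename InputRange, typename OutputIterator>
OutputIterator ranges_copy_sketch(const InputRange& input, OutputIterator out)
{
    return std::copy(input.begin(), input.end(), out);
}

// auto copy = [](const auto& x, auto& y) { ranges_copy_sketch(x, y.begin()); };
// copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);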
...@@ -84,9 +86,9 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -84,9 +86,9 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
Tensor<OutDataType> host_output(out_g_n_k_wos_desc); Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc); Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
std::cout << "input: " << input.mDesc << std::endl; std::cout << "input: " << input.GetDesc() << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "weight: " << weight.GetDesc() << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl; std::cout << "output: " << host_output.GetDesc() << std::endl;
switch(init_method) switch(init_method)
{ {
...@@ -100,12 +102,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -100,12 +102,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5}); weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
} }
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); DeviceMem in_device_buf(input.GetMemorySize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); DeviceMem wei_device_buf(weight.GetMemorySize());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); DeviceMem out_device_buf(device_output.GetMemorySize());
in_device_buf.ToDevice(input.mData.data()); in_device_buf.ToDevice(input.data());
wei_device_buf.ToDevice(weight.mData.data()); wei_device_buf.ToDevice(weight.data());
// run reference op // run reference op
if(do_verification) if(do_verification)
...@@ -163,28 +165,29 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -163,28 +165,29 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
// profile device op instances // profile device op instances
bool pass = true; bool pass = true;
using ck::utils::empty_array;
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), wei_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(), empty_array(),
std::array<const void*, 0>{}, out_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(), a_g_n_c_wis_lengths,
a_g_n_c_wis_lengths, a_g_n_c_wis_strides,
a_g_n_c_wis_strides, b_g_k_c_xs_lengths,
b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
b_g_k_c_xs_strides, empty_array(),
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}}, empty_array(),
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}}, e_g_n_k_wos_lengths,
e_g_n_k_wos_lengths, e_g_n_k_wos_strides,
e_g_n_k_wos_strides, conv_filter_strides,
conv_filter_strides, conv_filter_dilations,
conv_filter_dilations, input_left_pads,
input_left_pads, input_right_pads,
input_right_pads, in_element_op,
in_element_op, wei_element_op,
wei_element_op, out_element_op);
out_element_op);
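`empty_array()` (brought in via `using ck::utils::empty_array`) stands in for the zero-sized `std::array` temporaries that previously marked "no D tensors". One plausible way such a helper can serve several destination types at once, offered purely as a guess about the utility header rather than its actual implementation:

// Guess at how an empty_array() utility could be written; the actual
// ck::utils::empty_array may be implemented differently.
#include <array>

struct EmptyArraySketch
{
    // converts to a zero-sized std::array of any element type
    template <typename T>
    constexpr operator std::array<T, 0>() const
    {
        return {};
    }
};

constexpr EmptyArraySketch empty_array_sketch() { return {}; }

// std::array<const void*, 0> ds_ptrs    = empty_array_sketch();
// std::array<int, 0>         ds_strides = empty_array_sketch();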
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
...@@ -218,17 +221,17 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -218,17 +221,17 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
out_device_buf.FromDevice(device_output.mData.data()); out_device_buf.FromDevice(device_output.data());
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData); pass = pass & ck::utils::check_err(device_output, host_output);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "input : ", input, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "weight: ", weight, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_output : ", host_output.mData, ",") LogRangeAsType<float>(std::cout << "host_output : ", host_output, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",") LogRangeAsType<float>(std::cout << "device_output: ", device_output, ",")
<< std::endl; << std::endl;
} }
} }
......
...@@ -6,18 +6,19 @@ ...@@ -6,18 +6,19 @@
#include <iomanip> #include <iomanip>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -43,17 +44,17 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -43,17 +44,17 @@ bool profile_grouped_gemm_impl(int do_verification,
bool pass = true; bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -79,9 +80,9 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -79,9 +80,9 @@ bool profile_grouped_gemm_impl(int do_verification,
c_m_n_device_results.push_back( c_m_n_device_results.push_back(
Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].GetDesc() << ", b_k_n["
<< "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i << i << "]:" << b_k_n[i].GetDesc() << ", c_m_n_device_results[" << i
<< "]:" << c_m_n_device_results[i].mDesc << std::endl; << "]:" << c_m_n_device_results[i].GetDesc() << std::endl;
std::size_t num_thread = 1; std::size_t num_thread = 1;
switch(init_method) switch(init_method)
...@@ -132,17 +133,15 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -132,17 +133,15 @@ bool profile_grouped_gemm_impl(int do_verification,
for(std::size_t i = 0; i < group_count; i++) for(std::size_t i = 0; i < group_count; i++)
{ {
a_device_buf.emplace_back( a_device_buf.emplace_back(std::make_unique<DeviceMem>(a_m_k[i].GetMemorySize()));
std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize())); b_device_buf.emplace_back(std::make_unique<DeviceMem>(b_k_n[i].GetMemorySize()));
b_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
c_device_buf.emplace_back(std::make_unique<DeviceMem>( c_device_buf.emplace_back(
sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize())); std::make_unique<DeviceMem>(c_m_n_device_results[i].GetMemorySize()));
a_device_buf[i]->ToDevice(a_m_k[i].mData.data()); a_device_buf[i]->ToDevice(a_m_k[i].data());
b_device_buf[i]->ToDevice(b_k_n[i].mData.data()); b_device_buf[i]->ToDevice(b_k_n[i].data());
c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data()); c_device_buf[i]->ToDevice(c_m_n_device_results[i].data());
gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}}); gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
...@@ -207,7 +206,7 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -207,7 +206,7 @@ bool profile_grouped_gemm_impl(int do_verification,
std::size_t flop = 0, num_btype = 0; std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_descs.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
{ {
flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i]; flop += 2_uz * Ms[i] * Ns[i] * Ks[i];
num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] + num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
sizeof(CDataType) * Ms[i] * Ns[i]; sizeof(CDataType) * Ms[i] * Ns[i];
...@@ -232,7 +231,7 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -232,7 +231,7 @@ bool profile_grouped_gemm_impl(int do_verification,
for(std::size_t i = 0; i < gemm_descs.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
{ {
c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); c_device_buf[i]->FromDevice(c_m_n_device_results[i].data());
Tensor<CDataType> c_m_n_host_result( Tensor<CDataType> c_m_n_host_result(
f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})); f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
...@@ -257,19 +256,16 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -257,19 +256,16 @@ bool profile_grouped_gemm_impl(int do_verification,
c_element_op); c_element_op);
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData, pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",") LogRangeAsType<float>(std::cout << "a : ", a_m_k[i], ",") << std::endl;
<< std::endl; LogRangeAsType<float>(std::cout << "b: ", b_k_n[i], ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(
std::cout << "c_device: ", c_m_n_device_results[i].mData, ",") std::cout << "c_device: ", c_m_n_device_results[i], ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result, ",")
std::cout << "c_host : ", c_m_n_host_result.mData, ",")
<< std::endl; << std::endl;
} }
} }
......
...@@ -9,11 +9,11 @@ ...@@ -9,11 +9,11 @@
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" #include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -65,14 +65,14 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -65,14 +65,14 @@ bool profile_groupnorm_impl(int do_verification,
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5}); beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
} }
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); DeviceMem x_dev(x.GetMemorySize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); DeviceMem gamma_dev(gamma.GetMemorySize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); DeviceMem beta_dev(beta.GetMemorySize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); DeviceMem y_dev(y.GetMemorySize());
x_dev.ToDevice(x.mData.data()); x_dev.ToDevice(x.data());
gamma_dev.ToDevice(gamma.mData.data()); gamma_dev.ToDevice(gamma.data());
beta_dev.ToDevice(beta.mData.data()); beta_dev.ToDevice(beta.data());
// add device normalization instances // add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType, using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
...@@ -116,10 +116,10 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -116,10 +116,10 @@ bool profile_groupnorm_impl(int do_verification,
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = inst_ptr->MakeArgumentPointer(
length, length,
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, std::vector<ck::index_t>{x.GetStrides().begin(), x.GetStrides().end()},
gammaBetaStride, gammaBetaStride,
gammaBetaStride, gammaBetaStride,
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, std::vector<ck::index_t>{y.GetStrides().begin(), y.GetStrides().end()},
reduce_dim, reduce_dim,
1e-6, 1e-6,
x_dev.GetDeviceBuffer(), x_dev.GetDeviceBuffer(),
...@@ -141,10 +141,10 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -141,10 +141,10 @@ bool profile_groupnorm_impl(int do_verification,
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) + std::size_t num_bytes = x.GetElementSize() * sizeof(XDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + gamma.GetElementSize() * sizeof(GammaDataType) +
beta.mDesc.GetElementSize() * sizeof(BetaDataType) + beta.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType); y.GetElementSize() * sizeof(YDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
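For reference, this bandwidth formula assumes `avg_time` is reported in milliseconds: bytes divided by 1e6 and then by milliseconds reduces to GB/s (decimal gigabytes). A small worked example under that assumption, with made-up numbers:

// Hypothetical numbers, only to illustrate the unit conversion above.
#include <cstdio>

int main()
{
    const double num_bytes = 1.0e9; // bytes moved by the kernel (assumed)
    const double avg_time  = 2.0;   // kernel time in milliseconds (assumed)

    // bytes / 1e6 / ms  ==  (bytes / 1e9) / (ms / 1e3)  ==  GB / s
    const double gb_per_sec = num_bytes / 1.0e6 / avg_time;

    std::printf("%.1f GB/s\n", gb_per_sec); // prints 500.0 GB/s
    return 0;
}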
...@@ -161,16 +161,15 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -161,16 +161,15 @@ bool profile_groupnorm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
y_dev.FromDevice(y.mData.data()); y_dev.FromDevice(y.data());
bool pass = bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "x : ", x, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "host_y : ", host_y, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "y : ", y, ",") << std::endl;
} }
if(!pass) if(!pass)
......
...@@ -9,11 +9,11 @@ ...@@ -9,11 +9,11 @@
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" #include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -72,14 +72,14 @@ void profile_layernorm_impl(int do_verification, ...@@ -72,14 +72,14 @@ void profile_layernorm_impl(int do_verification,
y.GenerateTensorValue(GeneratorTensor_3<YDataType>{-0.5, 0.5}); y.GenerateTensorValue(GeneratorTensor_3<YDataType>{-0.5, 0.5});
} }
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); DeviceMem x_dev(x.GetMemorySize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); DeviceMem gamma_dev(gamma.GetMemorySize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); DeviceMem beta_dev(beta.GetMemorySize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); DeviceMem y_dev(y.GetMemorySize());
x_dev.ToDevice(x.mData.data()); x_dev.ToDevice(x.data());
gamma_dev.ToDevice(gamma.mData.data()); gamma_dev.ToDevice(gamma.data());
beta_dev.ToDevice(beta.mData.data()); beta_dev.ToDevice(beta.data());
constexpr int NumReduceDim = Rank - 1; constexpr int NumReduceDim = Rank - 1;
...@@ -149,10 +149,10 @@ void profile_layernorm_impl(int do_verification, ...@@ -149,10 +149,10 @@ void profile_layernorm_impl(int do_verification,
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) + std::size_t num_bytes = x.GetElementSize() * sizeof(XDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + gamma.GetElementSize() * sizeof(GammaDataType) +
beta.mDesc.GetElementSize() * sizeof(BetaDataType) + beta.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType); y.GetElementSize() * sizeof(YDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -168,16 +168,15 @@ void profile_layernorm_impl(int do_verification, ...@@ -168,16 +168,15 @@ void profile_layernorm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
y_dev.FromDevice(y.mData.data()); y_dev.FromDevice(y.data());
bool pass = ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "x : ", x, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "host_y : ", host_y, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "y : ", y, ",") << std::endl;
} }
if(!pass) if(!pass)
......
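The layernorm hunk replaces the explicit sizeof(T) * GetElementSpaceSize() arithmetic with Tensor<T>::GetMemorySize(). A minimal sketch of that idiom follows, assuming GetMemorySize() already accounts for the element size and data() forwards mData.data(); the function name is illustrative only:

    #include "ck/library/utility/device_memory.hpp"
    #include "ck/library/utility/host_tensor.hpp"

    template <typename T>
    void round_trip_through_device(Tensor<T>& host)
    {
        DeviceMem dev(host.GetMemorySize()); // was: sizeof(T) * host.mDesc.GetElementSpaceSize()
        dev.ToDevice(host.data());           // upload;  was: host.mData.data()
        dev.FromDevice(host.data());         // download back into the same host buffer
    }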
...@@ -6,15 +6,16 @@ ...@@ -6,15 +6,16 @@
#include <iomanip> #include <iomanip>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -87,7 +88,7 @@ void profile_normalization_impl(int do_verification, ...@@ -87,7 +88,7 @@ void profile_normalization_impl(int do_verification,
Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length) Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
: Tensor<InDataType>(in_length, in_strides); : Tensor<InDataType>(in_length, in_strides);
Tensor<OutDataType> out(in.mDesc); Tensor<OutDataType> out(in.GetDesc());
switch(init_method) switch(init_method)
{ {
...@@ -107,13 +108,13 @@ void profile_normalization_impl(int do_verification, ...@@ -107,13 +108,13 @@ void profile_normalization_impl(int do_verification,
Tensor<OutDataType> out_ref(out); Tensor<OutDataType> out_ref(out);
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); DeviceMem in_dev(in.GetMemorySize());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(out.GetMemorySize());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.data());
out_dev.ToDevice(out.mData.data()); out_dev.ToDevice(out.data());
std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end()); std::vector<index_t> i_in_lengths(in.GetLengths().begin(), in.GetLengths().end());
std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end()); std::vector<index_t> i_in_strides(in.GetStrides().begin(), in.GetStrides().end());
// add device softmax instances // add device softmax instances
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
...@@ -189,9 +190,8 @@ void profile_normalization_impl(int do_verification, ...@@ -189,9 +190,8 @@ void profile_normalization_impl(int do_verification,
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = in.mDesc.GetElementSize() * sizeof(InDataType) + (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType); std::size_t num_bytes = in.GetElementSize() * sizeof(InDataType) + (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -213,30 +213,27 @@ void profile_normalization_impl(int do_verification, ...@@ -213,30 +213,27 @@ void profile_normalization_impl(int do_verification,
ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims}); ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.data());
bool pass; bool pass;
if(std::is_same<InDataType, int8_t>::value) if constexpr(std::is_same_v<InDataType, int8_t>)
{ {
pass = ck::utils::check_err(out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1); pass = ck::utils::check_err(out, out_ref, "Error: Incorrect results!", 0, 1);
if(do_log) if(do_log)
{ {
LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<int>(std::cout << "in : ", in, ",") << std::endl;
LogRangeAsType<int>(std::cout << "out_ref : ", out_ref.mData, ",") LogRangeAsType<int>(std::cout << "out_ref : ", out_ref, ",") << std::endl;
<< std::endl; LogRangeAsType<int>(std::cout << "out : ", out, ",") << std::endl;
LogRangeAsType<int>(std::cout << "out : ", out.mData, ",") << std::endl;
} }
} }
else else
{ {
pass = ck::utils::check_err(out.mData, out_ref.mData); pass = ck::utils::check_err(out, out_ref);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "in : ", in, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_ref : ", out_ref.mData, ",") LogRangeAsType<float>(std::cout << "out_ref : ", out_ref, ",") << std::endl;
<< std::endl; LogRangeAsType<float>(std::cout << "out : ", out, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out : ", out.mData, ",") << std::endl;
} }
} }
......
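The verification branch in the softmax hunk switches to if constexpr, so the int8 and floating-point comparisons are selected at compile time instead of both being instantiated behind a runtime check. A sketch of that shape, with a hypothetical helper name and the same 0/1 tolerance pair shown above; it assumes ck::utils::check_err has the Tensor overloads the hunk uses:

    #include <cstdint>
    #include <type_traits>

    #include "ck/library/utility/check_err.hpp"
    #include "ck/library/utility/host_tensor.hpp"

    template <typename InDataType, typename OutDataType>
    bool verify_softmax_output(const Tensor<OutDataType>& out, const Tensor<OutDataType>& out_ref)
    {
        if constexpr(std::is_same_v<InDataType, std::int8_t>)
        {
            // integer data: use the 0 / 1 tolerance pair from the hunk above
            return ck::utils::check_err(out, out_ref, "Error: Incorrect results!", 0, 1);
        }
        else
        {
            return ck::utils::check_err(out, out_ref); // default tolerances
        }
    }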
...@@ -3,11 +3,13 @@ ...@@ -3,11 +3,13 @@
#pragma once #pragma once
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_reduction.hpp" #include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/host_common_util.hpp" #include "ck/library/utility/host_common_util.hpp"
...@@ -214,11 +216,11 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -214,11 +216,11 @@ bool profile_reduce_impl_impl(bool do_verification,
Tensor<int32_t> out_indices_ref(outLengths); Tensor<int32_t> out_indices_ref(outLengths);
Tensor<int32_t> out_indices(outLengths); Tensor<int32_t> out_indices(outLengths);
auto inStrides = in.mDesc.GetStrides(); auto inStrides = in.GetStrides();
auto outStrides = out.mDesc.GetStrides(); auto outStrides = out.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize(); size_t invariant_total_length = out.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; size_t reduce_total_length = in.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1; std::size_t num_thread = 1;
...@@ -245,20 +247,21 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -245,20 +247,21 @@ bool profile_reduce_impl_impl(bool do_verification,
} }
if(beta != 0.0f) if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) {
out.mData[i] = out_ref.mData[i]; ck::ranges::copy(out_ref, out.begin());
}
}; };
// these buffers are usually provided by the user application // these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); DeviceMem in_dev(in.GetMemorySize());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(out.GetMemorySize());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.data());
if(beta != 0.0f) if(beta != 0.0f)
out_dev.ToDevice(out.mData.data()); out_dev.ToDevice(out.data());
size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int) : 0; size_t indicesSizeInBytes = OutputIndex ? out.GetElementSize() * sizeof(int) : 0;
DeviceMem out_indices_dev(indicesSizeInBytes); DeviceMem out_indices_dev(indicesSizeInBytes);
...@@ -331,13 +334,13 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -331,13 +334,13 @@ bool profile_reduce_impl_impl(bool do_verification,
NumReduceDim, NumReduceDim,
PropagateNan, PropagateNan,
OutputIndex> OutputIndex>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce(in.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
hostReduce.Run(alpha, hostReduce.Run(alpha,
in.mData.data(), in.data(),
beta, beta,
out_ref.mData.data(), out_ref.data(),
out_indices_ref.mData.data(), out_indices_ref.data(),
in_elementwise_op, in_elementwise_op,
acc_elementwise_op); acc_elementwise_op);
}; };
...@@ -398,14 +401,13 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -398,14 +401,13 @@ bool profile_reduce_impl_impl(bool do_verification,
{ {
bool single_pass; bool single_pass;
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.data());
single_pass = ck::utils::check_err(out.mData, out_ref.mData); single_pass = ck::utils::check_err(out, out_ref);
if(OutputIndex) if(OutputIndex)
{ {
out_indices_dev.FromDevice(out_indices.mData.data()); out_indices_dev.FromDevice(out_indices.data());
single_pass = single_pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData); single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
}; };
if(!single_pass) if(!single_pass)
...@@ -418,18 +420,16 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -418,18 +420,16 @@ bool profile_reduce_impl_impl(bool do_verification,
if(do_dumpout) if(do_dumpout)
{ {
dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize()); dumpBufferToFile("dump_in.bin", in.data(), in.GetElementSize());
dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); dumpBufferToFile("dump_out.bin", out.data(), out.GetElementSize());
dumpBufferToFile("dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); dumpBufferToFile("dump_out_host.bin", out_ref.data(), out_ref.GetElementSize());
if(OutputIndex) if(OutputIndex)
{ {
dumpBufferToFile("dump_indices.bin", dumpBufferToFile(
out_indices.mData.data(), "dump_indices.bin", out_indices.data(), out_indices.GetElementSize());
out_indices.mDesc.GetElementSize());
dumpBufferToFile("dump_indices_host.bin", dumpBufferToFile("dump_indices_host.bin",
out_indices_ref.mData.data(), out_indices_ref.data(),
out_indices_ref.mDesc.GetElementSize()); out_indices_ref.GetElementSize());
}; };
}; };
}; };
......
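The reduce hunk swaps a manual element-copy loop for ck::ranges::copy when beta blends the previous output back in; the algorithm.hpp include added above presumably provides that utility. A sketch of the call shape, assuming ck::ranges::copy takes a source range plus an output iterator and that Tensor<T> models a range via begin(); the wrapper function is illustrative only:

    #include "ck/library/utility/algorithm.hpp"
    #include "ck/library/utility/host_tensor.hpp"

    template <typename T>
    void seed_output_for_blend(const Tensor<T>& out_ref, Tensor<T>& out, float beta)
    {
        if(beta != 0.0f)
        {
            // was: for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
            //          out.mData[i] = out_ref.mData[i];
            ck::ranges::copy(out_ref, out.begin());
        }
    }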
...@@ -98,8 +98,8 @@ TEST(Int4, CopyAsI8PositiveValue) ...@@ -98,8 +98,8 @@ TEST(Int4, CopyAsI8PositiveValue)
d_src_i4.ToDevice(h_src_i4.data()); d_src_i4.ToDevice(h_src_i4.data());
copy<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()), copy<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()), static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
SIZE); SIZE);
hip_check_error(hipDeviceSynchronize()); hip_check_error(hipDeviceSynchronize());
d_dst_i8.FromDevice(h_dst_i8.data()); d_dst_i8.FromDevice(h_dst_i8.data());
...@@ -125,8 +125,8 @@ TEST(Int4, DISABLED_CopyAsI8NegativeValue) ...@@ -125,8 +125,8 @@ TEST(Int4, DISABLED_CopyAsI8NegativeValue)
d_src_i4.ToDevice(h_src_i4.data()); d_src_i4.ToDevice(h_src_i4.data());
copy<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()), copy<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()), static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
SIZE); SIZE);
hip_check_error(hipDeviceSynchronize()); hip_check_error(hipDeviceSynchronize());
d_dst_i8.FromDevice(h_dst_i8.data()); d_dst_i8.FromDevice(h_dst_i8.data());
...@@ -152,8 +152,8 @@ TEST(Int4, CopyAsI8NegativeValueStaticCast) ...@@ -152,8 +152,8 @@ TEST(Int4, CopyAsI8NegativeValueStaticCast)
d_src_i4.ToDevice(h_src_i4.data()); d_src_i4.ToDevice(h_src_i4.data());
copy_with_static_cast<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()), copy_with_static_cast<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()), static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
SIZE); SIZE);
hip_check_error(hipDeviceSynchronize()); hip_check_error(hipDeviceSynchronize());
d_dst_i8.FromDevice(h_dst_i8.data()); d_dst_i8.FromDevice(h_dst_i8.data());
......
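The int4 test hunks replace reinterpret_cast with static_cast when retyping the device buffers. That works because DeviceMem::GetDeviceBuffer() appears (from this diff) to return a type-erased void*, and static_cast from void* to an object pointer type is well-defined; reinterpret_cast is only needed between unrelated object pointer types. A hypothetical helper expressing the same conversion:

    #include "ck/library/utility/device_memory.hpp"

    template <typename T>
    T* device_ptr_as(DeviceMem& buf)
    {
        // static_cast from void* is sufficient and states the intent more narrowly
        return static_cast<T*>(buf.GetDeviceBuffer());
    }

    // Usage mirroring the launches above (copy, SIZE, and the buffers come from the test):
    //   copy<<<1, 64>>>(device_ptr_as<const int4_t>(d_src_i4),
    //                   device_ptr_as<std::int8_t>(d_dst_i8),
    //                   SIZE);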
...@@ -5,11 +5,13 @@ ...@@ -5,11 +5,13 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp"
namespace ck { namespace ck {
namespace gemm_util { namespace gemm_util {
...@@ -71,9 +73,9 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, ...@@ -71,9 +73,9 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
BElementwiseOperation b_element_op, BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) CElementwiseOperation c_element_op)
{ {
DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize()); DeviceMem a_m_k_device_buf(A.GetMemorySize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(B.GetMemorySize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(C.GetMemorySize());
auto invoker_ptr = gemmPtr->MakeInvokerPointer(); auto invoker_ptr = gemmPtr->MakeInvokerPointer();
auto argument_ptr = auto argument_ptr =
...@@ -92,10 +94,10 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, ...@@ -92,10 +94,10 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
if(gemmPtr->IsSupportedArgument(argument_ptr.get())) if(gemmPtr->IsSupportedArgument(argument_ptr.get()))
{ {
a_m_k_device_buf.ToDevice(A.mData.data()); a_m_k_device_buf.ToDevice(A.data());
b_k_n_device_buf.ToDevice(B.mData.data()); b_k_n_device_buf.ToDevice(B.data());
invoker_ptr->Run(argument_ptr.get()); invoker_ptr->Run(argument_ptr.get());
c_m_n_device_buf.FromDevice(C.mData.data()); c_m_n_device_buf.FromDevice(C.data());
return true; return true;
} }
...@@ -124,17 +126,17 @@ struct TestGemm ...@@ -124,17 +126,17 @@ struct TestGemm
{ {
auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params) auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
{ {
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), std::vector<std::size_t>({stride, 1})); return HostTensorDescriptor({row, col}, {stride, 1_uz});
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), std::vector<std::size_t>({1, stride})); return HostTensorDescriptor({row, col}, {1_uz, stride});
} }
}; };
...@@ -204,29 +206,29 @@ struct TestGemm ...@@ -204,29 +206,29 @@ struct TestGemm
{ {
// Assert // Assert
bool res = false; bool res = false;
if(std::is_same<CDataType, float>::value) if constexpr(std::is_same_v<CDataType, float>)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, ck::half_t>::value) else if constexpr(std::is_same_v<CDataType, ck::half_t>)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, ck::bhalf_t>::value) else if constexpr(std::is_same_v<CDataType, ck::bhalf_t>)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, int8_t>::value) else if constexpr(std::is_same_v<CDataType, int8_t>)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, double>::value) else if constexpr(std::is_same_v<CDataType, double>)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
......
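The gemm test hunk builds HostTensorDescriptor from brace-enclosed lengths and strides, with the _uz literal from ck::literals keeping every stride a std::size_t so the initializer lists stay homogeneous. A sketch of that construction in free-function form (the function itself is hypothetical; the descriptor calls mirror the hunk):

    #include <cstddef>
    #include <type_traits>

    #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
    #include "ck/library/utility/host_tensor.hpp"
    #include "ck/library/utility/literals.hpp"

    template <typename Layout>
    HostTensorDescriptor make_matrix_descriptor(std::size_t row, std::size_t col, std::size_t stride)
    {
        using namespace ck::literals;

        if constexpr(std::is_same_v<Layout, ck::tensor_layout::gemm::RowMajor>)
            return HostTensorDescriptor({row, col}, {stride, 1_uz}); // row-major: unit stride along columns
        else
            return HostTensorDescriptor({row, col}, {1_uz, stride}); // column-major: unit stride along rows
    }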