Commit e4e99a49 authored by Po-Yen, Chen

Use new utilities to shorten code

parent 7acbf104
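For reference, a minimal sketch (not part of the commit) of the accessor-based pattern the hunks below convert to. The function name copy_roundtrip_example and the float element type are hypothetical; every call used here (Tensor, HostTensorDescriptor brace-init with the _uz literal, GetDesc(), GetMemorySize(), data(), DeviceMem, ToDevice/FromDevice, and tensor-level ck::utils::check_err) is taken from the diff itself, assuming the CK utility headers this commit includes.

```cpp
// Minimal sketch, assuming the CK utility headers used by this commit.
#include <cstddef>

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/literals.hpp"

void copy_roundtrip_example(std::size_t M, std::size_t N, std::size_t stride)
{
    using namespace ck::literals;

    // Old style (removed by this commit):
    //   HostTensorDescriptor(std::vector<std::size_t>({M, N}),
    //                        std::vector<std::size_t>({stride, 1}));
    //   DeviceMem buf(sizeof(float) * t.mDesc.GetElementSpaceSize());
    //   buf.ToDevice(t.mData.data());
    //
    // New style: brace-initialized descriptors with _uz literals and the
    // Tensor accessors GetDesc() / GetMemorySize() / data().
    Tensor<float> host(HostTensorDescriptor({M, N}, {stride, 1_uz}));
    Tensor<float> from_device(host.GetDesc());

    DeviceMem device_buf(host.GetMemorySize());
    device_buf.ToDevice(host.data());
    device_buf.FromDevice(from_device.data());

    // check_err now takes the tensors directly rather than their .mData members.
    ck::utils::check_err(from_device, host);
}
```

The same commit also switches dimension/stride lists to ck::ranges::to and ck::utils::to_array, as shown in the hunks for the convnd and gemm profilers below.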
......@@ -3,25 +3,26 @@
#pragma once
#include "ck/ck.hpp"
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
namespace ck {
namespace profiler {
......@@ -30,16 +31,16 @@ template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
std::cout << "[";
for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
{
std::cout << "[";
for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
{
std::cout << "[";
for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
{
std::cout << "[";
for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
{
std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
}
......@@ -88,9 +89,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,
Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
Tensor<OutDataType> output(out_g_n_k_wos_desc);
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight_host_result.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
std::cout << "input: " << input.GetDesc() << std::endl;
std::cout << "weight: " << weight_host_result.GetDesc() << std::endl;
std::cout << "output: " << output.GetDesc() << std::endl;
switch(init_method)
{
......@@ -104,13 +105,12 @@ bool profile_conv_bwd_weight_impl(int do_verification,
output.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) *
weight_device_result.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(input.GetMemorySize());
DeviceMem wei_device_buf(weight_device_result.GetMemorySize());
DeviceMem out_device_buf(output.GetMemorySize());
in_device_buf.ToDevice(input.mData.data());
out_device_buf.ToDevice(output.mData.data());
in_device_buf.ToDevice(input.data());
out_device_buf.ToDevice(output.data());
if(do_verification)
{
......@@ -165,10 +165,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
conv_param.N_,
conv_param.K_,
conv_param.C_,
......@@ -215,10 +214,9 @@ bool profile_conv_bwd_weight_impl(int do_verification,
if(do_verification)
{
wei_device_buf.FromDevice(weight_device_result.mData.data());
wei_device_buf.FromDevice(weight_device_result.data());
bool pass =
ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
bool pass = ck::utils::check_err(weight_host_result, weight_device_result);
if(!pass)
{
......
......@@ -4,15 +4,16 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace tensor_operation {
......@@ -66,21 +67,21 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
if constexpr(is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW> ||
is_same_v<decltype(layout), ck::tensor_layout::convolution::KCYX> ||
is_same_v<decltype(layout), ck::tensor_layout::convolution::NKHW>)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
else if constexpr(is_same_v<decltype(layout), tensor_layout::convolution::NHWC> ||
is_same_v<decltype(layout), tensor_layout::convolution::KYXC> ||
is_same_v<decltype(layout), tensor_layout::convolution::NHWK>)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
}
};
......@@ -92,17 +93,16 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
Tensor<OutDataType> bias_k(HostTensorDescriptor({K}));
// residual: assume same layout as output tensor
Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl;
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.GetDesc() << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.GetDesc() << std::endl;
std::cout << "bias_k: " << bias_k.GetDesc() << std::endl;
std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.GetDesc() << std::endl;
switch(init_method)
{
......@@ -157,17 +157,16 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize());
DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
DeviceMem wei_device_buf(wei_k_c_y_x.GetMemorySize());
DeviceMem out_device_buf(out_n_k_ho_wo_device_result.GetMemorySize());
DeviceMem bias_device_buf(bias_k.GetMemorySize());
DeviceMem resi_device_buf(resi_n_k_ho_wo.GetMemorySize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data());
in_device_buf.ToDevice(in_n_c_hi_wi.data());
wei_device_buf.ToDevice(wei_k_c_y_x.data());
bias_device_buf.ToDevice(bias_k.data());
resi_device_buf.ToDevice(resi_n_k_ho_wo.data());
using DeviceConvFwdBiasReluAddPtr = ck::tensor_operation::device::
DeviceConvFwdBiasActivationAddPtr<InElementOp, WeiElementOp, OutElementOp>;
......@@ -196,12 +195,11 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
// profile device Conv instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
bias_device_buf.GetDeviceBuffer(),
resi_device_buf.GetDeviceBuffer(),
N,
K,
C,
......@@ -225,7 +223,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t flop = 2_uz * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype =
sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +
......@@ -249,22 +247,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
if(do_verification)
{
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.data());
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
out_n_k_ho_wo_host_result.mData);
ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x, ",") << std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
std::cout << "out_host : ", out_n_k_ho_wo_host_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
std::cout << "out_device: ", out_n_k_ho_wo_device_result, ",")
<< std::endl;
}
}
......
......@@ -4,15 +4,16 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace tensor_operation {
......@@ -66,21 +67,21 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
if constexpr(is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW> ||
is_same_v<decltype(layout), ck::tensor_layout::convolution::KCYX> ||
is_same_v<decltype(layout), ck::tensor_layout::convolution::NKHW>)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
else if constexpr(is_same_v<decltype(layout), tensor_layout::convolution::NHWC> ||
is_same_v<decltype(layout), tensor_layout::convolution::KYXC> ||
is_same_v<decltype(layout), tensor_layout::convolution::NHWK>)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
}
};
......@@ -92,13 +93,12 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
Tensor<OutDataType> bias_k(HostTensorDescriptor({K}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
std::cout << "bias_k: " << bias_k.mDesc << std::endl;
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.GetDesc() << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.GetDesc() << std::endl;
std::cout << "bias_k: " << bias_k.GetDesc() << std::endl;
switch(init_method)
{
......@@ -149,15 +149,14 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
DeviceMem wei_device_buf(wei_k_c_y_x.GetMemorySize());
DeviceMem out_device_buf(out_n_k_ho_wo_device_result.GetMemorySize());
DeviceMem bias_device_buf(bias_k.GetMemorySize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
bias_device_buf.ToDevice(bias_k.mData.data());
in_device_buf.ToDevice(in_n_c_hi_wi.data());
wei_device_buf.ToDevice(wei_k_c_y_x.data());
bias_device_buf.ToDevice(bias_k.data());
using DeviceConvFwdBiasReluPtr = ck::tensor_operation::device::
DeviceConvFwdBiasActivationPtr<InElementOp, WeiElementOp, OutElementOp>;
......@@ -186,11 +185,10 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
// profile device Conv instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
bias_device_buf.GetDeviceBuffer(),
N,
K,
C,
......@@ -214,7 +212,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t flop = 2_uz * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype =
sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) +
......@@ -237,22 +235,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
if(do_verification)
{
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.data());
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
out_n_k_ho_wo_host_result.mData);
ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x, ",") << std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
std::cout << "out_host : ", out_n_k_ho_wo_host_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
std::cout << "out_device: ", out_n_k_ho_wo_device_result, ",")
<< std::endl;
}
}
......
......@@ -8,19 +8,19 @@
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace ck {
namespace profiler {
......@@ -60,9 +60,9 @@ bool profile_conv_fwd_impl(int do_verification,
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
std::cout << "input: " << input.GetDesc() << std::endl;
std::cout << "weight: " << weight.GetDesc() << std::endl;
std::cout << "output: " << host_output.GetDesc() << std::endl;
switch(init_method)
{
......@@ -76,12 +76,12 @@ bool profile_conv_fwd_impl(int do_verification,
weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(input.GetMemorySize());
DeviceMem wei_device_buf(weight.GetMemorySize());
DeviceMem out_device_buf(device_output.GetMemorySize());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data());
in_device_buf.ToDevice(input.data());
wei_device_buf.ToDevice(weight.data());
// run reference op
if(do_verification)
......@@ -139,10 +139,9 @@ bool profile_conv_fwd_impl(int do_verification,
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
conv_param.N_,
conv_param.K_,
conv_param.C_,
......@@ -189,17 +188,17 @@ bool profile_conv_fwd_impl(int do_verification,
if(do_verification)
{
out_device_buf.FromDevice(device_output.mData.data());
out_device_buf.FromDevice(device_output.data());
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
pass = pass & ck::utils::check_err(device_output, host_output);
if(do_log)
{
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_output : ", host_output.mData, ",")
LogRangeAsType<float>(std::cout << "input : ", input, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_output : ", host_output, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
LogRangeAsType<float>(std::cout << "device_output: ", device_output, ",")
<< std::endl;
}
}
......
......@@ -4,16 +4,17 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/utility/ranges.hpp"
using F16 = ck::half_t;
using F32 = float;
......@@ -241,16 +242,16 @@ template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
std::cout << "[";
for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
{
std::cout << "[";
for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
{
std::cout << "[";
for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
{
std::cout << "[";
for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
{
std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
}
......@@ -294,16 +295,16 @@ bool profile_convnd_bwd_data_impl(int do_verification,
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
auto input_dims = ck::ranges::to<std::vector<std::size_t>>({N, C});
input_dims.insert(
std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
auto filter_dims = ck::ranges::to<std::vector<std::size_t>>({K, C});
filter_dims.insert(std::end(filter_dims),
std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths));
std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
auto output_dims = ck::ranges::to<std::vector<std::size_t>>({N, K});
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
......@@ -317,9 +318,9 @@ bool profile_convnd_bwd_data_impl(int do_verification,
Tensor<OutDataType> output(
get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
std::cout << "input: " << input_host_result.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
std::cout << "input: " << input_host_result.GetDesc() << std::endl;
std::cout << "weights: " << weights.GetDesc() << std::endl;
std::cout << "output: " << output.GetDesc() << std::endl;
switch(init_method)
{
......@@ -333,12 +334,12 @@ bool profile_convnd_bwd_data_impl(int do_verification,
weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
}
DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
DeviceMem in_device_buf(input_device_result.GetMemorySize());
DeviceMem wei_device_buf(weights.GetMemorySize());
DeviceMem out_device_buf(output.GetMemorySize());
out_device_buf.ToDevice(output.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
out_device_buf.ToDevice(output.data());
wei_device_buf.ToDevice(weights.data());
// reset input to zero
in_device_buf.SetZero();
......@@ -391,10 +392,9 @@ bool profile_convnd_bwd_data_impl(int do_verification,
bool success = true;
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
auto argument_ptr = conv_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
N,
K,
C,
......@@ -440,7 +440,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
if(do_verification)
{
in_device_buf.FromDevice(input_device_result.mData.data());
in_device_buf.FromDevice(input_device_result.data());
if(!check_out(input_host_result, input_device_result))
{
......@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
}
success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
success = ck::utils::check_err(input_host_result, input_device_result);
if(do_log)
{
......
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/utility/ranges.hpp"
using F16 = ck::half_t;
using F32 = float;
......@@ -205,16 +206,16 @@ template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
std::cout << "[";
for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
for(int n = 0; n < ck::type_convert<int>(nhwc.GetLengths()[0]); n++)
{
std::cout << "[";
for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
for(int hi = 0; hi < ck::type_convert<int>(nhwc.GetLengths()[2]); hi++)
{
std::cout << "[";
for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
for(int wi = 0; wi < ck::type_convert<int>(nhwc.GetLengths()[3]); wi++)
{
std::cout << "[";
for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
for(int c = 0; c < ck::type_convert<int>(nhwc.GetLengths()[1]); c++)
{
std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
}
......@@ -258,16 +259,16 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
auto input_dims = ck::ranges::to<std::vector<std::size_t>>({N, C});
input_dims.insert(
std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
auto filter_dims = ck::ranges::to<std::vector<std::size_t>>({K, C});
filter_dims.insert(std::end(filter_dims),
std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths));
std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
auto output_dims = ck::ranges::to<std::vector<std::size_t>>({N, K});
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
......@@ -280,9 +281,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
Tensor<OutDataType> output(
get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights_host_result.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
std::cout << "input: " << input.GetDesc() << std::endl;
std::cout << "weights: " << weights_host_result.GetDesc() << std::endl;
std::cout << "output: " << output.GetDesc() << std::endl;
switch(init_method)
{
......@@ -296,12 +297,12 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
output.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
DeviceMem in_device_buf(input.GetMemorySize());
DeviceMem wei_device_buf(weights_device_result.GetMemorySize());
DeviceMem out_device_buf(output.GetMemorySize());
in_device_buf.ToDevice(input.mData.data());
out_device_buf.ToDevice(output.mData.data());
in_device_buf.ToDevice(input.data());
out_device_buf.ToDevice(output.data());
// reset input to zero
wei_device_buf.SetZero();
......@@ -359,10 +360,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
// wei_device_buf.SetZero();
//}
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
auto argument_ptr = conv_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
N,
K,
C,
......@@ -390,7 +390,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = 0;
if(std::is_same<InDataType, ck::bhalf_t>::value && split_k > 1)
if(std::is_same_v<InDataType, ck::bhalf_t> && split_k > 1)
{
// alloc work space
size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get());
......@@ -431,9 +431,9 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
if(do_verification)
{
wei_device_buf.FromDevice(weights_device_result.mData.data());
wei_device_buf.FromDevice(weights_device_result.data());
success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
success = ck::utils::check_err(weights_host_result, weights_device_result);
if(success == false)
{
......
......@@ -6,17 +6,19 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {
......@@ -45,17 +47,17 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
int StrideD1,
int StrideE)
{
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -66,11 +68,11 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "d0_m_n: " << d0_m_n.GetDesc() << std::endl;
std::cout << "d1_m_n: " << d1_m_n.GetDesc() << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -121,8 +123,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
Tensor<AccDataType> c_m_n(HostTensorDescriptor({M, N}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
......@@ -149,16 +150,16 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem d0_m_n_device_buf(d0_m_n.GetMemorySize());
DeviceMem d1_m_n_device_buf(d1_m_n.GetMemorySize());
DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
d1_m_n_device_buf.ToDevice(d1_m_n.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
d0_m_n_device_buf.ToDevice(d0_m_n.data());
d1_m_n_device_buf.ToDevice(d1_m_n.data());
std::string best_op_name;
float best_ave_time = 0;
......@@ -170,18 +171,18 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
auto argument_ptr =
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
d1_m_n_device_buf.GetDeviceBuffer()},
ck::utils::to_array({d0_m_n_device_buf.GetDeviceBuffer(),
d1_m_n_device_buf.GetDeviceBuffer()}),
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 2>{StrideD0, StrideD1},
ck::utils::to_array({StrideD0, StrideD1}),
StrideE,
a_element_op,
b_element_op,
......@@ -199,7 +200,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
......@@ -221,10 +222,9 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
e_device_buf.FromDevice(e_m_n_device_result.data());
pass = pass &&
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
......
......@@ -4,17 +4,18 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace tensor_operation {
......@@ -74,22 +75,21 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
int StrideC,
int StrideD0)
{
using namespace ck::literals;
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}),
std::vector<std::size_t>({stride}));
return HostTensorDescriptor({len}, {stride});
};
auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -99,22 +99,18 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce0_m_host_result(HostTensorDescriptor({M}));
Tensor<ReduceDataType> reduce1_m_host_result(HostTensorDescriptor({M}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce0_m_device_result(HostTensorDescriptor({M}));
Tensor<ReduceDataType> reduce1_m_device_result(HostTensorDescriptor({M}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
std::cout << "reduce0_m: " << reduce0_m_host_result.GetDesc() << std::endl;
std::cout << "reduce1_m: " << reduce1_m_host_result.GetDesc() << std::endl;
std::size_t num_thread = 1;
switch(init_method)
......@@ -217,23 +213,21 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
reduce0_m_device_result.mDesc.GetElementSpaceSize());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
reduce1_m_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
DeviceMem bias_device_buf(bias_n.GetMemorySize());
DeviceMem d0_device_buf(d0_m_n.GetMemorySize());
DeviceMem reduce0_device_buf(reduce0_m_device_result.GetMemorySize());
DeviceMem reduce1_device_buf(reduce1_m_device_result.GetMemorySize());
std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
reduce1_device_buf.GetDeviceBuffer()};
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
bias_device_buf.ToDevice(bias_n.mData.data());
d0_device_buf.ToDevice(d0_m_n.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
bias_device_buf.ToDevice(bias_n.data());
d0_device_buf.ToDevice(d0_m_n.data());
// add device GEMM instances
std::vector<ck::tensor_operation::device::instance::DeviceGemmBiasAddReduceNoOpPtr> gemm_ptrs;
......@@ -319,7 +313,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
std::string gemm_name = gemm_ptr->GetTypeString();
std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N;
std::size_t flop = 2_uz * M * N * K + 2_uz * M * N;
std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
......@@ -343,33 +337,29 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
c_device_buf.FromDevice(c_m_n_device_result.data());
reduce0_device_buf.FromDevice(reduce0_m_device_result.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "d0_host: ", reduce0_m_host_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "d0_device: ", reduce0_m_device_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "d1_host: ", reduce1_m_host_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "d1_device: ", reduce1_m_device_result, ",")
<< std::endl;
}
}
......
......@@ -6,17 +6,19 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {
......@@ -44,17 +46,17 @@ bool profile_gemm_bilinear_impl(int do_verification,
float alpha,
float beta)
{
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -64,10 +66,10 @@ bool profile_gemm_bilinear_impl(int do_verification,
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "d_m_n: " << d_m_n.GetDesc() << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -116,8 +118,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
Tensor<AccDataType> c_m_n(HostTensorDescriptor({M, N}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
......@@ -144,14 +145,14 @@ bool profile_gemm_bilinear_impl(int do_verification,
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem d_m_n_device_buf(d_m_n.GetMemorySize());
DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d_m_n_device_buf.ToDevice(d_m_n.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
d_m_n_device_buf.ToDevice(d_m_n.data());
std::string best_op_name;
float best_ave_time = 0;
......@@ -163,17 +164,17 @@ bool profile_gemm_bilinear_impl(int do_verification,
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
auto argument_ptr =
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_m_n_device_buf.GetDeviceBuffer()},
ck::utils::to_array({d_m_n_device_buf.GetDeviceBuffer()}),
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD},
ck::utils::to_array({StrideD}),
StrideE,
a_element_op,
b_element_op,
......@@ -191,7 +192,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
......@@ -213,10 +214,9 @@ bool profile_gemm_bilinear_impl(int do_verification,
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
e_device_buf.FromDevice(e_m_n_device_result.data());
pass = pass &&
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
......
......@@ -8,17 +8,18 @@
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {
......@@ -43,17 +44,17 @@ int profile_gemm_impl(int do_verification,
{
bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -62,9 +63,9 @@ int profile_gemm_impl(int do_verification,
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -86,12 +87,12 @@ int profile_gemm_impl(int do_verification,
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
BLayout,
......@@ -137,10 +138,9 @@ int profile_gemm_impl(int do_verification,
// profile device op instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
......@@ -163,7 +163,7 @@ int profile_gemm_impl(int do_verification,
float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
......@@ -185,18 +185,17 @@ int profile_gemm_impl(int do_verification,
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
c_device_buf.FromDevice(c_m_n_device_result.data());
pass =
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl;
}
}
......
......@@ -4,17 +4,18 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace tensor_operation {
......@@ -73,17 +74,17 @@ bool profile_gemm_reduce_impl(int do_verification,
{
bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -91,22 +92,18 @@ bool profile_gemm_reduce_impl(int do_verification,
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce0_m_host_result(HostTensorDescriptor({M}));
Tensor<ReduceDataType> reduce1_m_host_result(HostTensorDescriptor({M}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<ReduceDataType> reduce0_m_device_result(HostTensorDescriptor({M}));
Tensor<ReduceDataType> reduce1_m_device_result(HostTensorDescriptor({M}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
std::cout << "reduce0_m: " << reduce0_m_host_result.GetDesc() << std::endl;
std::cout << "reduce1_m: " << reduce1_m_host_result.GetDesc() << std::endl;
std::size_t num_thread = 1;
switch(init_method)
......@@ -189,19 +186,17 @@ bool profile_gemm_reduce_impl(int do_verification,
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
reduce0_m_device_result.mDesc.GetElementSpaceSize());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
reduce1_m_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
DeviceMem reduce0_device_buf(reduce0_m_device_result.GetMemorySize());
DeviceMem reduce1_device_buf(reduce1_m_device_result.GetMemorySize());
std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
reduce1_device_buf.GetDeviceBuffer()};
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
// add device GEMM instances
std::vector<ck::tensor_operation::device::instance::DeviceGemmReduceNoOpPtr> gemm_ptrs;
......@@ -287,7 +282,7 @@ bool profile_gemm_reduce_impl(int do_verification,
std::string gemm_name = gemm_ptr->GetTypeString();
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(CDataType) * N;
......@@ -309,33 +304,29 @@ bool profile_gemm_reduce_impl(int do_verification,
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
c_device_buf.FromDevice(c_m_n_device_result.data());
reduce0_device_buf.FromDevice(reduce0_m_device_result.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host: ", c_m_n_host_result, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "d0_host: ", reduce0_m_host_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "d0_device: ", reduce0_m_device_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "d1_host: ", reduce1_m_host_result, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "d1_device: ", reduce1_m_device_result, ",")
<< std::endl;
}
}
......
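The descriptor helper above is the same rewrite applied to every GEMM profiler in this commit: the runtime is_same<>::value test becomes an if constexpr branch, and the std::vector temporaries become braced lists with the _uz stride literal. A self-contained sketch of the helper, assuming the tensor_layout, literals and standard type-traits headers already included here:

using namespace ck::literals;

auto f_host_tensor_descriptor =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
        // The untaken branch is discarded at compile time, so no code is
        // instantiated for the other layout tag.
        if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
        {
            return HostTensorDescriptor({row, col}, {stride, 1_uz});
        }
        else
        {
            return HostTensorDescriptor({row, col}, {1_uz, stride});
        }
    };

// A 128 x 64 row-major matrix with leading dimension 64:
auto a_desc = f_host_tensor_descriptor(128, 64, 64, ck::tensor_layout::gemm::RowMajor{});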
......@@ -8,17 +8,18 @@
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {
......@@ -44,17 +45,17 @@ bool profile_gemm_splitk_impl(int do_verification,
{
bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -63,9 +64,9 @@ bool profile_gemm_splitk_impl(int do_verification,
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -87,13 +88,13 @@ bool profile_gemm_splitk_impl(int do_verification,
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem c_device_buf(c_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
c_device_buf.ToDevice(c_m_n_device_result.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
c_device_buf.ToDevice(c_m_n_device_result.data());
using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<ALayout,
BLayout,
......@@ -139,10 +140,9 @@ bool profile_gemm_splitk_impl(int do_verification,
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
......@@ -166,7 +166,7 @@ bool profile_gemm_splitk_impl(int do_verification,
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
......@@ -188,18 +188,17 @@ bool profile_gemm_splitk_impl(int do_verification,
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
c_device_buf.FromDevice(c_m_n_device_result.data());
pass =
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "a : ", a_m_k, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result, ",")
<< std::endl;
}
}
......
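2_uz comes from ck::literals (pulled in through ck/library/utility/literals.hpp above) and keeps the FLOP product in std::size_t without the std::size_t(2) cast. A small sketch of the bookkeeping around it; the data type, sizes and timing value are illustrative:

using namespace ck::literals;

const ck::index_t M = 4096, N = 4096, K = 1024;   // illustrative problem size
const float ave_time = 1.0f;                      // ms; in the profiler this comes from invoker_ptr->Run(...)

// 2_uz is a std::size_t literal, so the product is computed in std::size_t
// and cannot overflow a 32-bit int for large M * N * K.
std::size_t flop      = 2_uz * M * N * K;
std::size_t num_btype = sizeof(ck::half_t) * (M * K + K * N + M * N);

float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;   // FLOP / ns == TFLOP/s
float gb_per_sec = num_btype / 1.E6 / ave_time;                  // bytes / us == GB/s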
......@@ -8,19 +8,21 @@
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace ck {
namespace profiler {
......@@ -66,7 +68,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
......@@ -84,9 +86,9 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
std::cout << "input: " << input.GetDesc() << std::endl;
std::cout << "weight: " << weight.GetDesc() << std::endl;
std::cout << "output: " << host_output.GetDesc() << std::endl;
switch(init_method)
{
......@@ -100,12 +102,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(input.GetMemorySize());
DeviceMem wei_device_buf(weight.GetMemorySize());
DeviceMem out_device_buf(device_output.GetMemorySize());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data());
in_device_buf.ToDevice(input.data());
wei_device_buf.ToDevice(weight.data());
// run reference op
if(do_verification)
......@@ -163,19 +165,20 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
// profile device op instances
bool pass = true;
using ck::utils::empty_array;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
empty_array(),
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
empty_array(),
empty_array(),
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
......@@ -218,17 +221,17 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
if(do_verification)
{
out_device_buf.FromDevice(device_output.mData.data());
out_device_buf.FromDevice(device_output.data());
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
pass = pass & ck::utils::check_err(device_output, host_output);
if(do_log)
{
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_output : ", host_output.mData, ",")
LogRangeAsType<float>(std::cout << "input : ", input, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_output : ", host_output, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
LogRangeAsType<float>(std::cout << "device_output: ", device_output, ",")
<< std::endl;
}
}
......
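Two helpers are specific to this grouped-convolution hunk: ck::ranges::copy replaces the begin()/end() std::copy call that fills the fixed-size argument arrays, and ck::utils::empty_array() stands in for the zero-length std::array temporaries passed for the unused D-tensor pointers, lengths and strides. A minimal sketch of the copy helper, assuming the standard headers already pulled in by this file; rank and lengths are illustrative:

// Fill the fixed-size kernel-argument array from the descriptor lengths.
std::array<ck::index_t, 5> a_g_n_c_wis_lengths{};              // G, N, C, H, W for a 2-D conv
const std::vector<std::size_t> lengths = {1, 4, 8, 16, 16};    // illustrative values

// Before: std::copy(lengths.begin(), lengths.end(), a_g_n_c_wis_lengths.begin());
ck::ranges::copy(lengths, a_g_n_c_wis_lengths.begin());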
......@@ -6,18 +6,19 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace profiler {
......@@ -43,17 +44,17 @@ bool profile_grouped_gemm_impl(int do_verification,
bool pass = true;
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
if constexpr(is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -79,9 +80,9 @@ bool profile_grouped_gemm_impl(int do_verification,
c_m_n_device_results.push_back(
Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
<< "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
<< "]:" << c_m_n_device_results[i].mDesc << std::endl;
std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].GetDesc() << ", b_k_n["
<< i << "]:" << b_k_n[i].GetDesc() << ", c_m_n_device_results[" << i
<< "]:" << c_m_n_device_results[i].GetDesc() << std::endl;
std::size_t num_thread = 1;
switch(init_method)
......@@ -132,17 +133,15 @@ bool profile_grouped_gemm_impl(int do_verification,
for(std::size_t i = 0; i < group_count; i++)
{
a_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
b_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
a_device_buf.emplace_back(std::make_unique<DeviceMem>(a_m_k[i].GetMemorySize()));
b_device_buf.emplace_back(std::make_unique<DeviceMem>(b_k_n[i].GetMemorySize()));
c_device_buf.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
c_device_buf.emplace_back(
std::make_unique<DeviceMem>(c_m_n_device_results[i].GetMemorySize()));
a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data());
a_device_buf[i]->ToDevice(a_m_k[i].data());
b_device_buf[i]->ToDevice(b_k_n[i].data());
c_device_buf[i]->ToDevice(c_m_n_device_results[i].data());
gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
......@@ -207,7 +206,7 @@ bool profile_grouped_gemm_impl(int do_verification,
std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
flop += 2_uz * Ms[i] * Ns[i] * Ks[i];
num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
sizeof(CDataType) * Ms[i] * Ns[i];
......@@ -232,7 +231,7 @@ bool profile_grouped_gemm_impl(int do_verification,
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
c_device_buf[i]->FromDevice(c_m_n_device_results[i].data());
Tensor<CDataType> c_m_n_host_result(
f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
......@@ -257,19 +256,16 @@ bool profile_grouped_gemm_impl(int do_verification,
c_element_op);
ref_invoker.Run(ref_argument);
pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
c_m_n_host_result.mData);
pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "a : ", a_m_k[i], ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n[i], ",") << std::endl;
LogRangeAsType<float>(
std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
std::cout << "c_device: ", c_m_n_device_results[i], ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "c_host : ", c_m_n_host_result.mData, ",")
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result, ",")
<< std::endl;
}
}
......
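For the grouped GEMM the same utilities compose per group: the device buffer is sized from GetMemorySize(), staged with data(), and verification compares the Tensor objects directly. A round-trip sketch for one group, assuming the headers already included by this profiler; the shape is illustrative:

using namespace ck::literals;

Tensor<float> c_device(HostTensorDescriptor({128_uz, 128_uz}, {128_uz, 1_uz}));
Tensor<float> c_host(c_device.GetDesc());   // same shape for the CPU reference

auto c_buf = std::make_unique<DeviceMem>(c_device.GetMemorySize());
c_buf->ToDevice(c_device.data());

// ... run the device instance into c_buf and the reference GEMM into c_host ...

c_buf->FromDevice(c_device.data());
bool pass = ck::utils::check_err(c_device, c_host);   // Tensor overload, no .mData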
......@@ -9,11 +9,11 @@
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
namespace ck {
namespace profiler {
......@@ -65,14 +65,14 @@ bool profile_groupnorm_impl(int do_verification,
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
}
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem x_dev(x.GetMemorySize());
DeviceMem gamma_dev(gamma.GetMemorySize());
DeviceMem beta_dev(beta.GetMemorySize());
DeviceMem y_dev(y.GetMemorySize());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
x_dev.ToDevice(x.data());
gamma_dev.ToDevice(gamma.data());
beta_dev.ToDevice(beta.data());
// add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
......@@ -116,10 +116,10 @@ bool profile_groupnorm_impl(int do_verification,
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
length,
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
std::vector<ck::index_t>{x.GetStrides().begin(), x.GetStrides().end()},
gammaBetaStride,
gammaBetaStride,
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
std::vector<ck::index_t>{y.GetStrides().begin(), y.GetStrides().end()},
reduce_dim,
1e-6,
x_dev.GetDeviceBuffer(),
......@@ -141,10 +141,10 @@ bool profile_groupnorm_impl(int do_verification,
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType);
std::size_t num_bytes = x.GetElementSize() * sizeof(XDataType) +
gamma.GetElementSize() * sizeof(GammaDataType) +
beta.GetElementSize() * sizeof(BetaDataType) +
y.GetElementSize() * sizeof(YDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
......@@ -161,16 +161,15 @@ bool profile_groupnorm_impl(int do_verification,
if(do_verification)
{
y_dev.FromDevice(y.mData.data());
y_dev.FromDevice(y.data());
bool pass =
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "x : ", x, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y, ",") << std::endl;
}
if(!pass)
......
......@@ -9,11 +9,11 @@
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
namespace profiler {
......@@ -72,14 +72,14 @@ void profile_layernorm_impl(int do_verification,
y.GenerateTensorValue(GeneratorTensor_3<YDataType>{-0.5, 0.5});
}
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem x_dev(x.GetMemorySize());
DeviceMem gamma_dev(gamma.GetMemorySize());
DeviceMem beta_dev(beta.GetMemorySize());
DeviceMem y_dev(y.GetMemorySize());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
x_dev.ToDevice(x.data());
gamma_dev.ToDevice(gamma.data());
beta_dev.ToDevice(beta.data());
constexpr int NumReduceDim = Rank - 1;
......@@ -149,10 +149,10 @@ void profile_layernorm_impl(int do_verification,
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType);
std::size_t num_bytes = x.GetElementSize() * sizeof(XDataType) +
gamma.GetElementSize() * sizeof(GammaDataType) +
beta.GetElementSize() * sizeof(BetaDataType) +
y.GetElementSize() * sizeof(YDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
......@@ -168,16 +168,15 @@ void profile_layernorm_impl(int do_verification,
if(do_verification)
{
y_dev.FromDevice(y.mData.data());
y_dev.FromDevice(y.data());
bool pass = ck::utils::check_err(
y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "x : ", x, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y, ",") << std::endl;
}
if(!pass)
......
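The byte counts in the normalization profilers now go through Tensor::GetElementSize(), while the allocations above use GetMemorySize(). The distinction matters once strides pad the layout; a small sketch of the two quantities, assuming GetMemorySize() covers the full strided span in bytes, as its use in place of sizeof(T) * GetElementSpaceSize() throughout this commit suggests:

using namespace ck::literals;

// A 2 x 3 tensor stored with a padded row stride of 4.
Tensor<float> t(HostTensorDescriptor({2_uz, 3_uz}, {4_uz, 1_uz}));

std::size_t logical_elements = t.GetElementSize();   // 2 * 3 = 6, what num_bytes counts
std::size_t alloc_bytes      = t.GetMemorySize();    // >= 6 * sizeof(float): includes padding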
......@@ -6,15 +6,16 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
namespace ck {
namespace tensor_operation {
......@@ -87,7 +88,7 @@ void profile_normalization_impl(int do_verification,
Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
: Tensor<InDataType>(in_length, in_strides);
Tensor<OutDataType> out(in.mDesc);
Tensor<OutDataType> out(in.GetDesc());
switch(init_method)
{
......@@ -107,13 +108,13 @@ void profile_normalization_impl(int do_verification,
Tensor<OutDataType> out_ref(out);
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
in_dev.ToDevice(in.mData.data());
out_dev.ToDevice(out.mData.data());
DeviceMem in_dev(in.GetMemorySize());
DeviceMem out_dev(out.GetMemorySize());
in_dev.ToDevice(in.data());
out_dev.ToDevice(out.data());
std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());
std::vector<index_t> i_in_lengths(in.GetLengths().begin(), in.GetLengths().end());
std::vector<index_t> i_in_strides(in.GetStrides().begin(), in.GetStrides().end());
// add device softmax instances
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
......@@ -189,9 +190,8 @@ void profile_normalization_impl(int do_verification,
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes =
in.mDesc.GetElementSize() * sizeof(InDataType) +
(beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);
std::size_t num_bytes = in.GetElementSize() * sizeof(InDataType) +
(beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
......@@ -213,30 +213,27 @@ void profile_normalization_impl(int do_verification,
ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
out_dev.FromDevice(out.mData.data());
out_dev.FromDevice(out.data());
bool pass;
if(std::is_same<InDataType, int8_t>::value)
if constexpr(std::is_same_v<InDataType, int8_t>)
{
pass = ck::utils::check_err(
out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
pass = ck::utils::check_err(out, out_ref, "Error: Incorrect results!", 0, 1);
if(do_log)
{
LogRangeAsType<int>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<int>(std::cout << "out_ref : ", out_ref.mData, ",")
<< std::endl;
LogRangeAsType<int>(std::cout << "out : ", out.mData, ",") << std::endl;
LogRangeAsType<int>(std::cout << "in : ", in, ",") << std::endl;
LogRangeAsType<int>(std::cout << "out_ref : ", out_ref, ",") << std::endl;
LogRangeAsType<int>(std::cout << "out : ", out, ",") << std::endl;
}
}
else
{
pass = ck::utils::check_err(out.mData, out_ref.mData);
pass = ck::utils::check_err(out, out_ref);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_ref : ", out_ref.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "out : ", out.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "in : ", in, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_ref : ", out_ref, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out : ", out, ",") << std::endl;
}
}
......
......@@ -3,11 +3,13 @@
#pragma once
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/host_common_util.hpp"
......@@ -214,11 +216,11 @@ bool profile_reduce_impl_impl(bool do_verification,
Tensor<int32_t> out_indices_ref(outLengths);
Tensor<int32_t> out_indices(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
auto inStrides = in.GetStrides();
auto outStrides = out.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
size_t invariant_total_length = out.GetElementSize();
size_t reduce_total_length = in.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
......@@ -245,20 +247,21 @@ bool profile_reduce_impl_impl(bool do_verification,
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
{
ck::ranges::copy(out_ref, out.begin());
}
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
DeviceMem in_dev(in.GetMemorySize());
DeviceMem out_dev(out.GetMemorySize());
in_dev.ToDevice(in.mData.data());
in_dev.ToDevice(in.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
out_dev.ToDevice(out.data());
size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int) : 0;
size_t indicesSizeInBytes = OutputIndex ? out.GetElementSize() * sizeof(int) : 0;
DeviceMem out_indices_dev(indicesSizeInBytes);
......@@ -331,13 +334,13 @@ bool profile_reduce_impl_impl(bool do_verification,
NumReduceDim,
PropagateNan,
OutputIndex>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce(in.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
hostReduce.Run(alpha,
in.mData.data(),
in.data(),
beta,
out_ref.mData.data(),
out_indices_ref.mData.data(),
out_ref.data(),
out_indices_ref.data(),
in_elementwise_op,
acc_elementwise_op);
};
......@@ -398,14 +401,13 @@ bool profile_reduce_impl_impl(bool do_verification,
{
bool single_pass;
out_dev.FromDevice(out.mData.data());
single_pass = ck::utils::check_err(out.mData, out_ref.mData);
out_dev.FromDevice(out.data());
single_pass = ck::utils::check_err(out, out_ref);
if(OutputIndex)
{
out_indices_dev.FromDevice(out_indices.mData.data());
single_pass = single_pass &&
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
out_indices_dev.FromDevice(out_indices.data());
single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
};
if(!single_pass)
......@@ -418,18 +420,16 @@ bool profile_reduce_impl_impl(bool do_verification,
if(do_dumpout)
{
dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize());
dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
dumpBufferToFile(
"dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
dumpBufferToFile("dump_in.bin", in.data(), in.GetElementSize());
dumpBufferToFile("dump_out.bin", out.data(), out.GetElementSize());
dumpBufferToFile("dump_out_host.bin", out_ref.data(), out_ref.GetElementSize());
if(OutputIndex)
{
dumpBufferToFile("dump_indices.bin",
out_indices.mData.data(),
out_indices.mDesc.GetElementSize());
dumpBufferToFile(
"dump_indices.bin", out_indices.data(), out_indices.GetElementSize());
dumpBufferToFile("dump_indices_host.bin",
out_indices_ref.mData.data(),
out_indices_ref.mDesc.GetElementSize());
out_indices_ref.data(),
out_indices_ref.GetElementSize());
};
};
};
......
......@@ -98,8 +98,8 @@ TEST(Int4, CopyAsI8PositiveValue)
d_src_i4.ToDevice(h_src_i4.data());
copy<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
copy<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
SIZE);
hip_check_error(hipDeviceSynchronize());
d_dst_i8.FromDevice(h_dst_i8.data());
......@@ -125,8 +125,8 @@ TEST(Int4, DISABLED_CopyAsI8NegativeValue)
d_src_i4.ToDevice(h_src_i4.data());
copy<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
copy<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
SIZE);
hip_check_error(hipDeviceSynchronize());
d_dst_i8.FromDevice(h_dst_i8.data());
......@@ -152,8 +152,8 @@ TEST(Int4, CopyAsI8NegativeValueStaticCast)
d_src_i4.ToDevice(h_src_i4.data());
copy_with_static_cast<<<1, 64>>>(reinterpret_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
reinterpret_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
copy_with_static_cast<<<1, 64>>>(static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer()),
static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer()),
SIZE);
hip_check_error(hipDeviceSynchronize());
d_dst_i8.FromDevice(h_dst_i8.data());
......
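In the int4 tests the casts change because DeviceMem::GetDeviceBuffer() hands back a void*, so static_cast is sufficient to recover the typed pointers (this is also why the explicit casts disappear from the MakeArgumentPointer calls earlier in this commit, which take void* directly). A sketch of the launch using the same fixture names as the test above:

// static_cast from void* recovers the typed pointers the copy kernel expects.
const auto* src = static_cast<const int4_t*>(d_src_i4.GetDeviceBuffer());
auto*       dst = static_cast<std::int8_t*>(d_dst_i8.GetDeviceBuffer());

copy<<<1, 64>>>(src, dst, SIZE);
hip_check_error(hipDeviceSynchronize());
d_dst_i8.FromDevice(h_dst_i8.data());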
......@@ -5,11 +5,13 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
namespace gemm_util {
......@@ -71,9 +73,9 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize());
DeviceMem a_m_k_device_buf(A.GetMemorySize());
DeviceMem b_k_n_device_buf(B.GetMemorySize());
DeviceMem c_m_n_device_buf(C.GetMemorySize());
auto invoker_ptr = gemmPtr->MakeInvokerPointer();
auto argument_ptr =
......@@ -92,10 +94,10 @@ bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
if(gemmPtr->IsSupportedArgument(argument_ptr.get()))
{
a_m_k_device_buf.ToDevice(A.mData.data());
b_k_n_device_buf.ToDevice(B.mData.data());
a_m_k_device_buf.ToDevice(A.data());
b_k_n_device_buf.ToDevice(B.data());
invoker_ptr->Run(argument_ptr.get());
c_m_n_device_buf.FromDevice(C.mData.data());
c_m_n_device_buf.FromDevice(C.data());
return true;
}
......@@ -124,17 +126,17 @@ struct TestGemm
{
auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
{
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -204,29 +206,29 @@ struct TestGemm
{
// Assert
bool res = false;
if(std::is_same<CDataType, float>::value)
if constexpr(std::is_same_v<CDataType, float>)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, ck::half_t>::value)
else if constexpr(std::is_same_v<CDataType, ck::half_t>)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, ck::bhalf_t>::value)
else if constexpr(std::is_same_v<CDataType, ck::bhalf_t>)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, int8_t>::value)
else if constexpr(std::is_same_v<CDataType, int8_t>)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, double>::value)
else if constexpr(std::is_same_v<CDataType, double>)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
res = ck::utils::check_err(c_device, c_host);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
......