"docs/source/en/api/pipelines/stable_diffusion/depth2img.md" did not exist on "a69754bb879ed55b9b6dc9dd0b3cf4fa4124c765"
Unverified Commit 9b3365e1 authored by Po Yen Chen's avatar Po Yen Chen Committed by GitHub
Browse files

Merge branch 'develop' into gridwise_2d

parents 9608beee b79bbbc2
...@@ -262,6 +262,16 @@ rocm_package_setup_component(tests ...@@ -262,6 +262,16 @@ rocm_package_setup_component(tests
PACKAGE_NAME tests # Prevent -static suffix on package name PACKAGE_NAME tests # Prevent -static suffix on package name
) )
rocm_package_setup_component(examples
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler
)
add_subdirectory(library) add_subdirectory(library)
add_subdirectory(example) add_subdirectory(example)
add_subdirectory(test) add_subdirectory(test)
......
# Client example executables for forward convolution.
# Each one is built from a single source file and links privately against the
# composable_kernel::device_operations target.
add_executable(client_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
# Grouped variant of the forward-convolution client example.
add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::GNHWC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 256;
static constexpr ck::index_t K = 192;
static constexpr ck::index_t C = 192;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28;
// Owning handle for a single device allocation.
//
// The constructor requests `mem_size` bytes through hipMalloc and the destructor
// hands the pointer back to hipFree. Both return codes are deliberately
// discarded, as in the rest of this example; p_mem_ is value-initialised to
// null before the allocation call is made.
//
// The type is non-copyable: a copy would hold the same pointer and the two
// destructors would then free it twice. Declaring the copy operations as
// deleted also suppresses the implicit move operations, so the object is
// pinned to the scope that created it.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // explicit: a byte count must never convert silently into a device allocation.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        // &p_mem_ already has type void**, so no cast is required.
        (void)hipMalloc(&p_mem_, mem_size);
    }

    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    // Returns the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer() { return p_mem_; }

    void* p_mem_;
};
// Client example for grouped 2-D forward convolution (fp16, GNHWC / GKYXC / GNHWK).
//
// Steps:
//   1. describe the problem (lengths, packed strides, conv parameters),
//   2. time every instance returned by the instance factory,
//   3. report the instance with the highest TFlops figure,
//   4. run that instance once more with timing disabled.
//
// Returns EXIT_FAILURE when no instance accepts the problem, EXIT_SUCCESS otherwise.
int main()
{
    // Lengths are listed in the order given by the layout names above. Strides start
    // with only the innermost entry set (1); the others are derived right below.
    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};

    // Packed strides: walking from the innermost dimension outwards, each stride is the
    // running product of the lengths seen so far. The outermost length is not needed,
    // hence the std::prev on the end of the reversed range.
    const auto fill_packed_strides = [](const auto& lengths, auto& strides) {
        std::partial_sum(std::rbegin(lengths),
                         std::prev(std::rend(lengths)),
                         std::next(std::rbegin(strides)),
                         std::multiplies<>{});
    };
    fill_packed_strides(in_lengths, in_strides);
    fill_packed_strides(wei_lengths, wei_strides);
    fill_packed_strides(out_lengths, out_strides);

    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNKHW
    // Only the descriptor order changes: the last entry is moved in front of the two
    // spatial entries, and lengths and strides are permuted identically.
    const auto move_channel_before_spatial = [](auto& dims) {
        std::rotate(
            std::rbegin(dims), std::next(std::rbegin(dims)), std::next(std::rbegin(dims), 3));
    };
    move_channel_before_spatial(in_lengths);
    move_channel_before_spatial(in_strides);
    move_channel_before_spatial(wei_lengths);
    move_channel_before_spatial(wei_strides);
    move_channel_before_spatial(out_lengths);
    move_channel_before_spatial(out_strides);

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 ck::Tuple<>,
                                                                                 OutLayout,
                                                                                 InDataType,
                                                                                 WeiDataType,
                                                                                 ck::Tuple<>,
                                                                                 OutDataType,
                                                                                 PassThrough,
                                                                                 PassThrough,
                                                                                 PassThrough>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Single place where the kernel argument is assembled, so the timed runs and the
    // final untimed run are guaranteed to describe the same problem.
    const auto make_argument = [&](const auto& op) {
        return op->MakeArgumentPointer(in.GetDeviceBuffer(),
                                       wei.GetDeviceBuffer(),
                                       {},
                                       out.GetDeviceBuffer(),
                                       in_lengths,
                                       in_strides,
                                       wei_lengths,
                                       wei_strides,
                                       {},
                                       {},
                                       out_lengths,
                                       out_strides,
                                       filter_strides,
                                       filter_dilations,
                                       input_left_pads,
                                       input_right_pads,
                                       PassThrough{},
                                       PassThrough{},
                                       PassThrough{});
    };

    // Work and traffic depend only on the problem size, so compute them once.
    const std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
    const std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
                                  sizeof(WeiDataType) * G * K * Y * X * C +
                                  sizeof(OutDataType) * G * N * Ho * Wo * K;

    std::string best_op_name;
    int best_op_id        = -1; // -1 == nothing selected yet
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    // size_t index: avoids comparing a signed int against the unsigned size().
    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr   = make_argument(op_ptr);
        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cerr << op_name << " does not support this problem" << std::endl;
            continue;
        }

        // avg_time is printed in milliseconds below, which is what the 1.E9 / 1.E6
        // divisors rely on to yield TFlops and GB/s.
        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

        float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                  << gb_per_sec << " GB/s, " << op_name << std::endl;

        if(tflops > best_tflops)
        {
            best_op_id      = static_cast<int>(i);
            best_op_name    = op_name;
            best_avg_time   = avg_time;
            best_gb_per_sec = gb_per_sec;
            best_tflops     = tflops;
        }
    }

    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return EXIT_SUCCESS;
}
# Client example executable for the fused elementwise + 2-D layernorm operation.
# Built from a single source file; links privately against the
# composable_kernel::device_operations target.
add_executable(client_elementwise_layernorm2d elementwise_layernorm2d.cpp)
target_link_libraries(client_elementwise_layernorm2d PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
using ADataType = ck::half_t; // Input 1
using BDataType = ck::half_t; // Input 2
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using AccDataType = float;
using XElementwiseOperation = ck::tensor_operation::element_wise::Add;
using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
// Scope-bound owner of one device buffer.
//
// Construction asks hipMalloc for `mem_size` bytes; destruction passes the stored
// pointer to hipFree. The HIP status codes are intentionally ignored here, and
// p_mem_ is null-initialised before hipMalloc is called.
//
// Copying is disabled because two objects holding the same pointer would each
// free it on destruction. With the copy operations deleted, no implicit move
// operations are generated either.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // explicit so that an integer size cannot implicitly become an allocation.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        // p_mem_ is a void*, so its address is already the void** hipMalloc expects.
        (void)hipMalloc(&p_mem_, mem_size);
    }

    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    // Non-owning access to the device pointer.
    void* GetDeviceBuffer() { return p_mem_; }

    void* p_mem_;
};
// Client example for the fused "elementwise add + 2-D layernorm" operation (fp16 data,
// float accumulation).
//
// Steps:
//   1. allocate the two inputs, gamma, beta and the output on the device,
//   2. time every instance returned by the instance factory,
//   3. report the instance with the lowest average time,
//   4. run that instance once more with timing disabled.
//
// Returns 1 when no instance supports the problem, 0 otherwise.
int main()
{
    const bool time_kernel = true;

    const ck::index_t M      = 48 * 256;
    const ck::index_t N      = 1024;
    const ck::index_t Stride = N;

    // Number of elements spanned by an M x N matrix whose rows are `Stride` apart.
    const auto mn_size = (M - 1) * Stride + N;

    SimpleDeviceMem a_dev_buf(sizeof(ADataType) * mn_size);
    SimpleDeviceMem b_dev_buf(sizeof(BDataType) * mn_size);
    SimpleDeviceMem gamma_dev_buf(sizeof(GammaDataType) * N);
    SimpleDeviceMem beta_dev_buf(sizeof(BetaDataType) * N);
    SimpleDeviceMem y_dev_buf(sizeof(YDataType) * mn_size);

    std::array<const void*, 2> ab_input = {a_dev_buf.GetDeviceBuffer(),
                                           b_dev_buf.GetDeviceBuffer()};
    std::vector<ck::index_t> abStride   = {Stride, 1};
    std::array<std::vector<ck::index_t>, 2> abStrides = {abStride, abStride};

    using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
        ck::Tuple<ADataType, BDataType>,
        GammaDataType,
        BetaDataType,
        AccDataType,
        YDataType,
        XElementwiseOperation,
        YElementwiseOperation,
        Rank,
        NumReduceDim>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Single place where the kernel argument is assembled. The timed runs and the final
    // untimed run must describe the same problem; in particular gamma and beta are
    // N-element buffers used with a rank-2 {M, N} problem, so their strides are the
    // rank-2 pair {0, 1} in both cases.
    const auto make_argument = [&](const auto& op) {
        return op->MakeArgumentPointer({M, N}, // lengths
                                       abStrides,
                                       {0, 1},      // gammaStrides
                                       {0, 1},      // betaStrides
                                       {Stride, 1}, // yStrides
                                       {1},         // reduceDims
                                       1e-4,
                                       ab_input,
                                       gamma_dev_buf.GetDeviceBuffer(),
                                       beta_dev_buf.GetDeviceBuffer(),
                                       y_dev_buf.GetDeviceBuffer(),
                                       XElementwiseOperation{},
                                       YElementwiseOperation{});
    };

    // Bytes read and written depend only on the problem size, so compute them once.
    const std::size_t num_byte = sizeof(ADataType) * M * N + sizeof(BDataType) * M * N +
                                 sizeof(GammaDataType) * N + sizeof(BetaDataType) * N +
                                 sizeof(YDataType) * M * N;

    std::string best_op_name;
    int best_op_id        = -1; // -1 == nothing selected yet
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    // size_t index: avoids comparing a signed int against the unsigned size().
    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr   = make_argument(op_ptr);
        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << op_name << " does not support this problem" << std::endl;
            continue;
        }

        // ave_time is printed in milliseconds below, which is what the 1.E6 divisor
        // relies on to yield GB/s.
        float ave_time =
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        float gb_per_sec = num_byte / 1.E6 / ave_time;

        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
                  << op_name << std::endl;

        if(ave_time < best_ave_time)
        {
            best_op_id      = static_cast<int>(i);
            best_op_name    = op_name;
            best_ave_time   = ave_time;
            best_gb_per_sec = gb_per_sec;
        }
    }

    // Without this guard the block below would index op_ptrs with -1.
    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return 1;
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
              << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
...@@ -98,16 +98,18 @@ int main() ...@@ -98,16 +98,18 @@ int main()
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdData<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
InLayout, OutLayout,
WeiLayout, WeiLayout,
OutLayout, ck::Tuple<>,
InDataType, InLayout,
WeiDataType, OutDataType,
OutDataType, WeiDataType,
PassThrough, ck::Tuple<>,
PassThrough, InDataType,
PassThrough>; PassThrough,
PassThrough,
PassThrough>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
...@@ -126,15 +128,18 @@ int main() ...@@ -126,15 +128,18 @@ int main()
for(int i = 0; i < op_ptrs.size(); ++i) for(int i = 0; i < op_ptrs.size(); ++i)
{ {
auto& op_ptr = op_ptrs[i]; auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(), wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), {},
in_lengths, in.GetDeviceBuffer(),
in_strides,
wei_lengths,
wei_strides,
out_lengths, out_lengths,
out_strides, out_strides,
wei_lengths,
wei_strides,
{},
{},
in_lengths,
in_strides,
filter_strides, filter_strides,
filter_dilations, filter_dilations,
input_left_pads, input_left_pads,
...@@ -189,15 +194,18 @@ int main() ...@@ -189,15 +194,18 @@ int main()
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl; << std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(), wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), {},
in_lengths, in.GetDeviceBuffer(),
in_strides,
wei_lengths,
wei_strides,
out_lengths, out_lengths,
out_strides, out_strides,
wei_lengths,
wei_strides,
{},
{},
in_lengths,
in_strides,
filter_strides, filter_strides,
filter_dilations, filter_dilations,
input_left_pads, input_left_pads,
......
# Client example executable for grouped 2-D convolution backward-weight.
# Built from a single source file; links privately against the
# composable_kernel::device_operations target.
add_executable(client_grouped_conv2d_bwd_weight grouped_conv2d_bwd_weight.cpp)
target_link_libraries(client_grouped_conv2d_bwd_weight PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
#include <vector> #include <iterator>
#include <numeric>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
...@@ -15,21 +17,22 @@ using InDataType = ck::half_t; ...@@ -15,21 +17,22 @@ using InDataType = ck::half_t;
using WeiDataType = ck::half_t; using WeiDataType = ck::half_t;
using OutDataType = ck::half_t; using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::NHWC; using InLayout = ck::tensor_layout::convolution::GNHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK; using OutLayout = ck::tensor_layout::convolution::GNHWK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t N = 16; static constexpr ck::index_t G = 32;
static constexpr ck::index_t K = 32; static constexpr ck::index_t N = 256;
static constexpr ck::index_t C = 3; static constexpr ck::index_t K = 192;
static constexpr ck::index_t C = 192;
static constexpr ck::index_t Y = 3; static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3;
static constexpr ck::index_t Hi = 224; static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 224; static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 113; static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 113; static constexpr ck::index_t Wo = 28;
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -47,30 +50,33 @@ struct SimpleDeviceMem ...@@ -47,30 +50,33 @@ struct SimpleDeviceMem
void* p_mem_; void* p_mem_;
}; };
int main(int argc, char* argv[]) int main()
{ {
std::vector<ck::index_t> in_spatial_lengths{Hi, Wi}; std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
std::vector<ck::index_t> filter_spatial_lengths{Y, X}; std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
std::vector<ck::index_t> out_spatial_lengths{Ho, Wo}; std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
std::vector<ck::index_t> filter_strides{2, 2};
std::vector<ck::index_t> filter_dilations{1, 1}; std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
std::vector<ck::index_t> input_left_pads{2, 2}; std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
std::vector<ck::index_t> input_right_pads{2, 2}; std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); ck::index_t split_k = 2;
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial, SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
InLayout, SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
WeiLayout,
OutLayout, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
InDataType, InLayout,
WeiDataType, WeiLayout,
OutDataType, OutLayout,
PassThrough, InDataType,
PassThrough, WeiDataType,
PassThrough>; OutDataType,
PassThrough,
PassThrough,
PassThrough>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances(); DeviceOp>::GetInstances();
...@@ -92,19 +98,21 @@ int main(int argc, char* argv[]) ...@@ -92,19 +98,21 @@ int main(int argc, char* argv[])
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(), wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), out.GetDeviceBuffer(),
G,
N, N,
K, K,
C, C,
in_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
out_spatial_lengths, output_spatial_lengths,
filter_strides, conv_filter_strides,
filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
input_right_pads, input_right_pads,
PassThrough{}, PassThrough{},
PassThrough{}, PassThrough{},
PassThrough{}); PassThrough{},
split_k);
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
...@@ -112,10 +120,10 @@ int main(int argc, char* argv[]) ...@@ -112,10 +120,10 @@ int main(int argc, char* argv[])
{ {
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = 2 * N * K * C * Ho * Wo * Y * X; std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C + std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
sizeof(WeiDataType) * K * Y * X * C + sizeof(WeiDataType) * G * K * Y * X * C +
sizeof(OutDataType) * N * Ho * Wo * K; sizeof(OutDataType) * G * N * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -134,10 +142,16 @@ int main(int argc, char* argv[]) ...@@ -134,10 +142,16 @@ int main(int argc, char* argv[])
} }
else else
{ {
std::cout << op_name << " does not support this problem" << std::endl; std::cerr << op_name << " does not support this problem" << std::endl;
} }
} }
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
...@@ -149,21 +163,22 @@ int main(int argc, char* argv[]) ...@@ -149,21 +163,22 @@ int main(int argc, char* argv[])
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(), wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), out.GetDeviceBuffer(),
G,
N, N,
K, K,
C, C,
in_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
out_spatial_lengths, output_spatial_lengths,
filter_strides, conv_filter_strides,
filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
input_right_pads, input_right_pads,
PassThrough{}, PassThrough{},
PassThrough{}, PassThrough{},
PassThrough{}); PassThrough{},
split_k);
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
...@@ -172,6 +187,4 @@ int main(int argc, char* argv[]) ...@@ -172,6 +187,4 @@ int main(int argc, char* argv[])
std::cout << "Done" << std::endl; std::cout << "Done" << std::endl;
} }
}
return 0;
}
\ No newline at end of file
...@@ -6,6 +6,8 @@ ...@@ -6,6 +6,8 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp"
#include "ck/library/utility/literals.hpp"
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float; using F32 = float;
...@@ -135,15 +137,15 @@ int main(int argc, char* argv[]) ...@@ -135,15 +137,15 @@ int main(int argc, char* argv[])
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -240,7 +242,7 @@ int main(int argc, char* argv[]) ...@@ -240,7 +242,7 @@ int main(int argc, char* argv[])
show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl; show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
} }
#endif #endif
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
} }
return 0; return 0;
......
...@@ -32,14 +32,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -32,14 +32,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
case 0: break; case 0: break;
case 1: case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(), ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
a_m_k.end()); ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
b_k_n.end());
break; break;
default: default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end()); ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end()); ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
} }
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
...@@ -133,11 +131,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -133,11 +131,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>(); c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>();
return ck::utils::check_err(c_m_n_device_result_converted.mData, c_m_n_host_result.mData); return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result);
#else #else
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
#endif #endif
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
...@@ -177,15 +178,15 @@ int main(int argc, char* argv[]) ...@@ -177,15 +178,15 @@ int main(int argc, char* argv[])
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -271,8 +272,7 @@ int main(int argc, char* argv[]) ...@@ -271,8 +272,7 @@ int main(int argc, char* argv[])
if(do_verification) if(do_verification)
{ {
Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor( Tensor<CShuffleDataType> c_m_n({M, N});
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
...@@ -299,7 +299,7 @@ int main(int argc, char* argv[]) ...@@ -299,7 +299,7 @@ int main(int argc, char* argv[])
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1; return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
} }
return 0; return 0;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
...@@ -155,15 +156,15 @@ int main(int argc, char* argv[]) ...@@ -155,15 +156,15 @@ int main(int argc, char* argv[])
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -275,7 +276,7 @@ int main(int argc, char* argv[]) ...@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
} }
} }
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1; return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
} }
return 0; return 0;
......
...@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC ...@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
if(config.do_verification) if(config.do_verification)
{ {
Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N}); Tensor<AccDataType> c_m_n({M, N});
auto ref_gemm = ReferenceGemmInstance{}; auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker(); auto ref_invoker = ref_gemm.MakeInvoker();
...@@ -147,9 +147,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC ...@@ -147,9 +147,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
#ifdef BUILD_INT4_EXAMPLE #ifdef BUILD_INT4_EXAMPLE
const Tensor<EDataType> e_m_n_device_result_converted(e_m_n_device_result); const Tensor<EDataType> e_m_n_device_result_converted(e_m_n_device_result);
return ck::utils::check_err(e_m_n_device_result_converted.mData, e_m_n_host_result.mData); return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
#else #else
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData); return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
#endif #endif
} }
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
...@@ -84,7 +85,7 @@ bool run_grouped_conv_fwd(bool do_verification, ...@@ -84,7 +85,7 @@ bool run_grouped_conv_fwd(bool do_verification,
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{}; std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
...@@ -164,7 +165,7 @@ bool run_grouped_conv_fwd(bool do_verification, ...@@ -164,7 +165,7 @@ bool run_grouped_conv_fwd(bool do_verification,
out_device_buf.FromDevice(out_device.mData.data()); out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err( return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
} }
return true; return true;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...@@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size) ...@@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{ {
std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_}; std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
std::copy(begin(problem_size.output_spatial_lengths_), ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));
end(problem_size.output_spatial_lengths_),
std::back_inserter(dimensions));
return HostTensorDescriptor(dimensions); return HostTensorDescriptor(dimensions);
} }
...@@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor, ...@@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
assert(size(descriptor.GetStrides()) == size(strides)); assert(size(descriptor.GetStrides()) == size(strides));
std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides)); std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
} }
/// @brief Range-based convenience wrapper around std::copy.
///
/// Forwards the whole of @p range (from std::begin to std::end) to std::copy,
/// writing the elements to the destination that starts at @p iter.
///
/// @param range Source range; anything accepted by std::begin / std::end.
/// @param iter  Output iterator marking where writing starts.
/// @return Whatever std::copy returns for these arguments.
template <typename Range, typename OutputIterator>
auto copy(const Range& range, OutputIterator iter)
    -> decltype(std::copy(std::begin(range), std::end(range), iter))
{
    const auto first = std::begin(range);
    const auto last  = std::end(range);

    return std::copy(first, last, iter);
}
...@@ -77,15 +77,12 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, ...@@ -77,15 +77,12 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
{ {
case 0: break; case 0: break;
case 1: case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(), ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
conv_input.end()); ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
conv_weight.end());
break; break;
default: default:
ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end()); ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(), ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
conv_weight.end());
} }
DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize()); DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
...@@ -123,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, ...@@ -123,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides); conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides); unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);
copy(problem_size.conv_filter_strides_, begin(conv_filter_strides)); ck::ranges::copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations)); ck::ranges::copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
copy(problem_size.input_left_pads_, begin(input_left_pads)); ck::ranges::copy(problem_size.input_left_pads_, begin(input_left_pads));
copy(problem_size.input_right_pads_, begin(input_right_pads)); ck::ranges::copy(problem_size.input_right_pads_, begin(input_right_pads));
// run Conv + Reduction on device // run Conv + Reduction on device
auto conv = DeviceInstance<NDimSpatial>{}; auto conv = DeviceInstance<NDimSpatial>{};
...@@ -276,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, ...@@ -276,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
conv_output_device_buf.FromDevice(conv_output_device.mData.data()); conv_output_device_buf.FromDevice(conv_output_device.mData.data());
r0_device_buf.FromDevice(r0_device.mData.data()); r0_device_buf.FromDevice(r0_device.mData.data());
return ck::utils::check_err(conv_output_device.mData, return ck::utils::check_err(conv_output_device,
conv_output_host.mData, conv_output_host,
"Error: incorrect results! (Matrix E)", "Error: incorrect results! (Matrix E)",
1e-5f, 1e-5f,
1e-4f) && 1e-4f) &&
ck::utils::check_err(r0_device.mData, ck::utils::check_err(
r0_host.mData, r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
"Error: incorrect results! (Matrix R0)",
1e-5f,
1e-4f);
} }
return true; return true;
......
...@@ -324,12 +324,12 @@ int reduce_blockwise_impl(bool do_verification, ...@@ -324,12 +324,12 @@ int reduce_blockwise_impl(bool do_verification,
#endif #endif
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData); pass = pass && ck::utils::check_err(out, out_ref);
if(OutputIndex) if(OutputIndex)
{ {
out_index_dev.FromDevice(out_indices.mData.data()); out_index_dev.FromDevice(out_indices.mData.data());
pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData); pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
}; };
}; };
......
...@@ -294,7 +294,7 @@ int main(int argc, char* argv[]) ...@@ -294,7 +294,7 @@ int main(int argc, char* argv[])
if(do_verify) if(do_verify)
{ {
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData); pass = pass && ck::utils::check_err(out, out_ref);
}; };
return (pass ? 0 : 1); return (pass ? 0 : 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment