Unverified Commit 9c052804, authored by Bartłomiej Kocot, committed by GitHub

Add elementwise with dynamic vector dim (#1198)

* Add elementwise with dynamic vector dim

* Reduce number of instances

* Fixes

* Fixes
parent fd0d093e
@@ -9,18 +9,13 @@
 namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_permute_scale_5d_f16_instances(
-    std::vector<std::unique_ptr<
-        DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 5>>>& instances)
-{
-    add_device_operation_instances(instances, device_permute_scale_f16_instances<5>{});
-}
-
-void add_device_permute_scale_5d_f32_instances(
-    std::vector<std::unique_ptr<
-        DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 5>>>& instances)
-{
-    add_device_operation_instances(instances, device_permute_scale_f32_instances<5>{});
-}
-
+using Scale = element_wise::Scale;
+
+void add_device_permute_scale_5d_f16_instances(
+    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Scale, 5>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_permute_scale_f16_instances<5, Scale>{});
+}
+
 } // namespace instance
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using Scale = element_wise::Scale;
void add_device_permute_scale_5d_f32_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Scale, 5>>>&
instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<5, Scale>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
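For context, a minimal sketch of how these split per-precision instance files are consumed after the template change. It assumes the usual CK factory entry points (DeviceOperationInstanceFactory<...>::GetInstances() and GetTypeString()), which are not part of this diff:

// Sketch only: lists the registered 5-D f32 permute+scale instances.
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"

using F32   = float;
using Scale = ck::tensor_operation::element_wise::Scale;

// DeviceElementwise is now parameterized as <InDataTuple, OutDataTuple, ElementwiseOp, NumDim>;
// the extra PassThrough/UnarySquare parameters of the old signature are gone.
using DeviceOp =
    ck::tensor_operation::device::DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Scale, 5>;

int main()
{
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances\n";
    for(const auto& op_ptr : op_ptrs)
        std::cout << op_ptr->GetTypeString() << '\n';
}

The 6-D files below follow the same pattern with NumDim set to 6.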
@@ -9,18 +9,13 @@
 namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_permute_scale_6d_f16_instances(
-    std::vector<std::unique_ptr<
-        DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 6>>>& instances)
-{
-    add_device_operation_instances(instances, device_permute_scale_f16_instances<6>{});
-}
-
-void add_device_permute_scale_6d_f32_instances(
-    std::vector<std::unique_ptr<
-        DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 6>>>& instances)
-{
-    add_device_operation_instances(instances, device_permute_scale_f32_instances<6>{});
-}
-
+using Scale = element_wise::Scale;
+
+void add_device_permute_scale_6d_f16_instances(
+    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Scale, 6>>>&
+        instances)
+{
+    add_device_operation_instances(instances, device_permute_scale_f16_instances<6, Scale>{});
+}
+
 } // namespace instance
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using Scale = element_wise::Scale;
void add_device_permute_scale_6d_f32_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Scale, 6>>>&
instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<6, Scale>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -8,9 +8,9 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 
 #include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
@@ -21,23 +21,12 @@
 #include "ck/library/utility/literals.hpp"
 
 namespace ck {
-template <typename HostTensorA,
-          typename HostTensorB,
-          typename AElementOp,
-          typename BElementOp,
-          typename ScaleElementOp>
+template <typename HostTensorA, typename HostTensorB, typename ElementOp>
 void reference_permute_scale(HostTensorB& b_tensor,
                              const HostTensorA& a_tensor,
-                             AElementOp a_tensor_op,
-                             BElementOp b_tensor_op,
-                             ScaleElementOp scale_op)
+                             ElementOp tensor_op)
 {
-    b_tensor.ForEach([&](auto& self, auto idx) {
-        auto tmp_val = a_tensor(idx);
-        b_tensor_op(tmp_val, tmp_val);
-        scale_op(tmp_val, tmp_val);
-        a_tensor_op(self(idx), tmp_val);
-    });
+    b_tensor.ForEach([&](auto& self, auto idx) { tensor_op(self(idx), a_tensor(idx)); });
 }
 
 namespace profiler {
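The collapsed reference relies solely on the functor's (output, input) call shape. Below is a self-contained host-side sketch of the same check; ScaleOp is a stand-in whose y = scale * x semantics are assumed to match ck::tensor_operation::element_wise::Scale:

#include <cassert>
#include <cstddef>
#include <vector>

// Stand-in for ck::tensor_operation::element_wise::Scale (assumed semantics: y = scale_ * x).
struct ScaleOp
{
    float scale_;
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const
    {
        y = static_cast<Y>(scale_ * x);
    }
};

// Same shape as the new reference_permute_scale: one functor applied per element.
template <typename ElementOp>
void reference_scale(std::vector<float>& b, const std::vector<float>& a, ElementOp op)
{
    for(std::size_t i = 0; i < b.size(); ++i)
        op(b[i], a[i]);
}

int main()
{
    std::vector<float> a{1.f, 2.f, 3.f}, b(3);
    reference_scale(b, a, ScaleOp{2.f});
    assert(b[1] == 4.f); // 2 * 2
}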
@@ -54,9 +43,7 @@ bool profile_permute_scale_impl(int do_verification,
     bool pass           = true;
     bool instance_found = false;
 
-    using ElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using UnaryOp   = ck::tensor_operation::element_wise::UnarySquare;
-    using Scale     = ck::tensor_operation::element_wise::Scale;
+    using ElementOp = ck::tensor_operation::element_wise::Scale;
 
     float scale = 2.f;
     Tensor<ADataType> a(lengths_vector, input_strides_vector);
@@ -80,12 +67,8 @@ bool profile_permute_scale_impl(int do_verification,
     std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
     std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
 
-    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
-                                                                     ck::Tuple<BDataType>,
-                                                                     ElementOp,
-                                                                     UnaryOp,
-                                                                     Scale,
-                                                                     NumDim>;
+    using DeviceOp = ck::tensor_operation::device::
+        DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, ElementOp, NumDim>;
 
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -100,7 +83,7 @@ bool profile_permute_scale_impl(int do_verification,
     if(do_verification)
     {
-        reference_permute_scale(host_b, a, ElementOp{}, UnaryOp{}, Scale{scale});
+        reference_permute_scale(host_b, a, ElementOp{scale});
     }
 
     auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
@@ -113,14 +96,8 @@ bool profile_permute_scale_impl(int do_verification,
     for(auto& op_ptr : op_ptrs)
     {
-        auto argument_ptr = op_ptr->MakeArgumentPointer(lengths,
-                                                        {input_strides},
-                                                        {output_strides},
-                                                        input,
-                                                        output,
-                                                        ElementOp{},
-                                                        UnaryOp{},
-                                                        Scale{scale});
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            lengths, {input_strides}, {output_strides}, input, output, ElementOp{scale});
 
         auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -141,6 +118,7 @@ bool profile_permute_scale_impl(int do_verification,
             if(do_log)
             {
                 LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "host_b: ", host_b.mData, ",") << std::endl;
                 LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
             }
         }
...
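Taken together, the profiler changes in this file reduce each instance invocation to a single Scale functor. A condensed sketch of that flow follows; run_permute_scale_5d_f32 is a hypothetical helper, and GetInstances, IsSupportedArgument, and StreamConfig are the standard CK hooks assumed from the surrounding code rather than shown in these hunks:

// Sketch only (hypothetical helper): runs the first registered instance that supports
// the given 5-D f32 problem and returns its average kernel time.
#include <array>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"

float run_permute_scale_5d_f32(const std::array<ck::index_t, 5>& lengths,
                               const std::array<ck::index_t, 5>& input_strides,
                               const std::array<ck::index_t, 5>& output_strides,
                               const void* in_dev, // device buffer holding the input
                               void* out_dev,      // device buffer receiving the output
                               float scale)
{
    using ElementOp = ck::tensor_operation::element_wise::Scale;
    using DeviceOp  = ck::tensor_operation::device::
        DeviceElementwise<ck::Tuple<float>, ck::Tuple<float>, ElementOp, 5>;

    auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    for(auto& op_ptr : op_ptrs)
    {
        // Same call shape as in profile_permute_scale_impl above: one Scale functor only.
        auto argument_ptr = op_ptr->MakeArgumentPointer(
            lengths, {input_strides}, {output_strides}, {in_dev}, {out_dev}, ElementOp{scale});

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
            continue; // e.g. vectorization constraints not met for these strides

        auto invoker_ptr = op_ptr->MakeInvokerPointer();
        return invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
    }
    return -1.f; // no instance supported the problem
}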
@@ -37,6 +37,20 @@ static void print_helper_msg()
     // clang-format on
 }
 
+void init_strides(const std::vector<ck::index_t>& lengths,
+                  const std::vector<ck::index_t>& dims_order,
+                  std::vector<ck::index_t>& strides)
+{
+    ck::index_t stride = 1;
+    for(ck::index_t d = lengths.size() - 1; d >= 0; d--)
+    {
+        ck::index_t dim = dims_order[d];
+        strides[dim]    = stride;
+        stride *= lengths[dim];
+    }
+}
+
 } // namespace
 
 int profile_permute_scale(int argc, char* argv[])
@@ -58,16 +72,21 @@ int profile_permute_scale(int argc, char* argv[])
     const int num_dims = dims_argc / 3;
 
     std::vector<ck::index_t> lengths(num_dims);
-    std::vector<ck::index_t> input_strides(num_dims);
-    std::vector<ck::index_t> output_strides(num_dims);
+    std::vector<ck::index_t> input_dims_order(num_dims);
+    std::vector<ck::index_t> output_dims_order(num_dims);
 
     for(int i = 0; i < num_dims; i++)
     {
         lengths[i] = std::stoi(argv[control_argc + i]);
-        input_strides[i]  = std::stoi(argv[control_argc + num_dims + i]);
-        output_strides[i] = std::stoi(argv[control_argc + 2 * num_dims + i]);
+        input_dims_order[i]  = std::stoi(argv[control_argc + num_dims + i]);
+        output_dims_order[i] = std::stoi(argv[control_argc + 2 * num_dims + i]);
     }
 
+    std::vector<ck::index_t> input_strides(num_dims);
+    std::vector<ck::index_t> output_strides(num_dims);
+
+    init_strides(lengths, input_dims_order, input_strides);
+    init_strides(lengths, output_dims_order, output_strides);
+
     using F32 = float;
     using F16 = ck::half_t;
...
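init_strides turns a dimension order into packed strides, with the dimension listed last in dims_order becoming the fastest-varying one. A standalone sketch of the same logic follows (plain int stands in for ck::index_t so it builds without the CK headers), traced against the 3-D profiler line "8 1024 8192 0 1 2 2 1 0" used in the script below:

#include <cassert>
#include <vector>

// Same logic as the init_strides added above, with int standing in for ck::index_t.
void init_strides(const std::vector<int>& lengths,
                  const std::vector<int>& dims_order,
                  std::vector<int>& strides)
{
    int stride = 1;
    for(int d = lengths.size() - 1; d >= 0; d--)
    {
        int dim      = dims_order[d];
        strides[dim] = stride;
        stride *= lengths[dim];
    }
}

int main()
{
    const std::vector<int> lengths{8, 1024, 8192};
    std::vector<int> in_strides(3), out_strides(3);

    init_strides(lengths, {0, 1, 2}, in_strides);  // row-major input layout
    init_strides(lengths, {2, 1, 0}, out_strides); // reversed dimension order for the output

    assert((in_strides == std::vector<int>{1024 * 8192, 8192, 1}));
    assert((out_strides == std::vector<int>{1, 8, 8 * 1024}));
}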
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
echo $DRIVER
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6
# 1D
######## op datatype verify init log time dims in_strides_order out_strides_order
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 67108864 0 0
# # 2D
# ######## op datatype verify init log time dims in_strides_order out_strides_order
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8192 8192 0 1 1 0
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8192 8192 1 0 0 1
# 3D
######## op datatype verify init log time dims in_strides_order out_strides_order
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 1024 8192 0 1 2 2 1 0
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 1024 8192 2 1 0 0 1 2
# 4D
######## op datatype verify init log time dims in_strides_order out_strides_order
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 2 512 8192 0 1 2 3 3 2 1 0
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 2 512 8192 3 2 1 0 0 1 2 3
# 5D
######## op datatype verify init log time dims in_strides_order out_strides_order
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 2 2 256 8192 0 1 2 3 4 4 3 2 1 0
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 2 2 256 8192 4 3 2 1 0 0 1 2 3 4
# 6D
######## op datatype verify init log time dims in_strides_order out_strides_order
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 2 2 2 128 8192 0 1 2 3 4 5 5 4 3 2 1 0
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 8 2 2 2 128 8192 5 4 3 2 1 0 0 1 2 3 4 5
@@ -52,40 +52,40 @@ TYPED_TEST_SUITE(TestPermute, KernelTypes);
 TYPED_TEST(TestPermute, Test1D)
 {
     constexpr ck::index_t NumDims = 1;
-    this->template Run<NumDims>({8}, {1}, {2});
-    this->template Run<NumDims>({8}, {2}, {1});
+    this->template Run<NumDims>({16}, {1}, {1});
+    this->template Run<NumDims>({16}, {1}, {2});
     this->template Run<NumDims>({1}, {1}, {1});
 }
 
 TYPED_TEST(TestPermute, Test2D)
 {
     constexpr ck::index_t NumDims = 2;
-    this->template Run<NumDims>({8, 4}, {4, 1}, {1, 8});
-    this->template Run<NumDims>({8, 4}, {1, 8}, {4, 1});
+    this->template Run<NumDims>({8, 16}, {16, 1}, {1, 8});
+    this->template Run<NumDims>({8, 16}, {1, 8}, {16, 1});
     this->template Run<NumDims>({1, 1}, {1, 1}, {1, 1});
 }
 
 TYPED_TEST(TestPermute, Test3D)
 {
     constexpr ck::index_t NumDims = 3;
-    this->template Run<NumDims>({2, 4, 4}, {16, 4, 1}, {1, 2, 8});
-    this->template Run<NumDims>({2, 4, 4}, {1, 2, 8}, {16, 4, 1});
+    this->template Run<NumDims>({8, 2, 8}, {16, 8, 1}, {1, 8, 16});
+    this->template Run<NumDims>({8, 2, 8}, {1, 8, 16}, {16, 8, 1});
     this->template Run<NumDims>({1, 1, 1}, {1, 1, 1}, {1, 1, 1});
 }
 
 TYPED_TEST(TestPermute, Test4D)
 {
     constexpr ck::index_t NumDims = 4;
-    this->template Run<NumDims>({2, 4, 4, 4}, {64, 16, 4, 1}, {1, 2, 8, 32});
-    this->template Run<NumDims>({2, 4, 4, 4}, {1, 2, 8, 32}, {64, 16, 4, 1});
+    this->template Run<NumDims>({8, 2, 3, 8}, {48, 24, 8, 1}, {1, 8, 16, 48});
+    this->template Run<NumDims>({8, 2, 3, 8}, {1, 8, 16, 48}, {48, 24, 8, 1});
     this->template Run<NumDims>({1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1});
 }
 
 TYPED_TEST(TestPermute, Test5D)
 {
     constexpr ck::index_t NumDims = 5;
-    this->template Run<NumDims>({2, 4, 4, 4, 4}, {256, 64, 16, 4, 1}, {1, 2, 8, 32, 128});
-    this->template Run<NumDims>({2, 4, 4, 4, 4}, {1, 2, 8, 32, 128}, {256, 64, 16, 4, 1});
+    this->template Run<NumDims>({8, 2, 3, 4, 8}, {192, 96, 32, 8, 1}, {1, 8, 16, 48, 192});
+    this->template Run<NumDims>({8, 2, 3, 4, 8}, {1, 8, 16, 48, 192}, {192, 96, 32, 8, 1});
     this->template Run<NumDims>({1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1});
 }
@@ -93,8 +93,8 @@ TYPED_TEST(TestPermute, Test6D)
 {
     constexpr ck::index_t NumDims = 6;
     this->template Run<NumDims>(
-        {2, 4, 4, 4, 4, 4}, {1024, 256, 64, 16, 4, 1}, {1, 2, 8, 32, 128, 512});
+        {8, 2, 3, 4, 5, 8}, {960, 480, 160, 40, 8, 1}, {1, 8, 16, 48, 192, 960});
     this->template Run<NumDims>(
-        {2, 4, 4, 4, 4, 4}, {1, 2, 8, 32, 128, 512}, {1024, 256, 64, 16, 4, 1});
+        {8, 2, 3, 4, 5, 8}, {1, 8, 16, 48, 192, 960}, {960, 480, 160, 40, 8, 1});
     this->template Run<NumDims>({1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1});
 }