Unverified Commit 0d911822 authored by Qianfeng, committed by GitHub

Padded Generic Kernel Instance (#730)



* Add a NumReduceDim template parameter to DeviceSoftmax and the Softmax client API to simplify instance collection

* Move the generic kernel instance to the front of the instance list for the elementwise op of normalization

* Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax

* Add testing of GetGenericInstance() in client_example of Softmax

* Revert "Add testing of GetGenericInstance() in client_example of Softmax"

This reverts commit f629cd9a93ce38dfed4886d849f3c38d2e5379c8.

* Revert "Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax"

This reverts commit a9f0d000eb9fd240404112a526ef125429a351df.

* Make the generic kernel instance the first instance returned by GetInstances() for GroupNorm

* Move the generic kernel instance to a separate tuple for the elementwise op of normalization

* Remove unused softmax instance files

* Store the generic kernel instance in a separate tuple for softmax

* Add IsSupported checking for the generic instance to the softmax client example (see the usage sketch after the commit metadata below)

* Replace get_device_normalize_from_mean_meansquare_instances() with the DeviceOperationInstanceFactory class for elementwise normalization

* clang-format fix

* Remove int8 from softmax instances

---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>
parent d140bdc9
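
A minimal, hedged sketch of the client-side pattern these changes target: query the f16 softmax instances for rank 3 with 1 reduced dimension, and run the first instance that reports support. Because the generic (padded) instance is now registered first, problem sizes that the tuned instances reject still resolve to a working kernel. The include path, type aliases, problem sizes, and the MakeArgumentPointer argument order below are assumptions for illustration; only the DeviceSoftmax signature carrying NumReduceDim and the generic-first ordering come from this commit.

#include <vector>

#include "ck/library/tensor_operation_instance/gpu/softmax.hpp" // assumed include path

using F16  = ck::half_t;
using F32  = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;

// DeviceSoftmax now carries NumReduceDim (here 1) as its last template argument.
using DeviceOp =
    ck::tensor_operation::device::DeviceSoftmax<F16, F32, F16, Pass, Pass, 3, 1>;

bool run_first_supported_softmax(const void* in_dev, void* out_dev)
{
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    const std::vector<ck::index_t> lengths{4, 17, 1000}; // illustrative problem size
    const std::vector<ck::index_t> strides{17000, 1000, 1};
    const std::vector<ck::index_t> reduce_dims{2}; // reduce the innermost dimension

    for(const auto& op_ptr : op_ptrs)
    {
        // Argument order follows the usual CK softmax client example; treat it
        // as an assumption rather than the exact API introduced by this commit.
        auto arg_ptr = op_ptr->MakeArgumentPointer(
            lengths, strides, reduce_dims, 1.0, 0.0, in_dev, out_dev, Pass{}, Pass{});

        // The generic (padded) instance comes first in op_ptrs, so it acts as a
        // fallback whenever the tuned instances cannot handle this shape.
        if(op_ptr->IsSupportedArgument(arg_ptr.get()))
        {
            auto invoker_ptr = op_ptr->MakeInvokerPointer();
            invoker_ptr->Run(arg_ptr.get());
            return true;
        }
    }
    return false; // no instance supports this problem
}
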
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&
instances)
{
add_device_operation_instances(instances,
device_normalization_f32_generic_instance<Swish, 5, 3>{});
add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
}
......
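
For context, the commit-message item about GetInstances() for GroupNorm refers to lookups like the following hedged sketch. The include path and the existence of a DeviceOperationInstanceFactory specialization for DeviceNormalization are assumptions; the template arguments mirror the rank-5/3 Swish hunk above.

#include <iostream>

#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" // assumed include path

using F32   = float;
using Swish = ck::tensor_operation::element_wise::Swish;

// Rank-5 input with 3 reduced dimensions, matching the GroupNorm-style
// instances registered in the hunk above.
using DeviceOp = ck::tensor_operation::device::
    DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>;

void list_groupnorm_instances()
{
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    // With this change the generic (padded) kernel is op_ptrs.front(); the
    // tuned instances follow in registration order.
    for(const auto& op_ptr : op_ptrs)
        std::cout << op_ptr->GetTypeString() << '\n';
}
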
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances(
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
instances)
{
add_device_operation_instances(instances,
device_normalization_f16_generic_instance<Pass, 2, 1>{});
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
}
......
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances(
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
instances)
{
add_device_operation_instances(instances,
device_normalization_f32_generic_instance<Pass, 2, 1>{});
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
}
......
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances(
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
instances)
{
add_device_operation_instances(instances,
device_normalization_f16_generic_instance<Pass, 4, 3>{});
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
}
......
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances(
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
instances)
{
add_device_operation_instances(instances,
device_normalization_f32_generic_instance<Pass, 4, 3>{});
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
}
......
@@ -43,6 +43,13 @@ using device_normalization_f16_instances =
// clang-format on
>;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f16_generic_instance = std::tuple<
// clang-format off
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
// clang-format on
>;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f32_instances = std::tuple<
// clang-format off
@@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple<
// clang-format on
>;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f32_generic_instance = std::tuple<
// clang-format off
DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
// clang-format on
>;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f16_f32_f32_f16_instances = std::tuple<
// clang-format off
@@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple<
// clang-format on
>;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple<
// clang-format off
DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
......
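
The header above now pairs each tuned instance tuple with a single-entry *_generic_instance tuple. Registering a further rank/reduce combination would reuse those tuples with the same two-call pattern as the earlier .cpp hunks; the function below is a hypothetical sketch (this exact f16 rank-5/3 Swish variant is not part of the diff):

// Hypothetical registration function (name assumed, not taken from this diff):
// the generic padded instance is appended first and the tuned instances after
// it, so GetInstances() hands the generic kernel out ahead of the rest.
void add_device_normalization_rank_5_3_swish_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&
        instances)
{
    add_device_operation_instances(instances,
                                   device_normalization_f16_generic_instance<Swish, 5, 3>{});
    add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
}
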
add_instance_library(device_softmax_instance
device_softmax_i8_i8_instance.cpp
device_softmax_i8_i8_instance_rank3_reduce1.cpp
device_softmax_i8_i8_instance_rank3_reduce2.cpp
device_softmax_i8_i8_instance_rank3_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce1.cpp
device_softmax_i8_i8_instance_rank4_reduce2.cpp
device_softmax_i8_i8_instance_rank4_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce4.cpp
device_softmax_f16_f16_instance.cpp
device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -15,7 +6,6 @@ add_instance_library(device_softmax_instance
device_softmax_f16_f16_instance_rank4_reduce2.cpp
device_softmax_f16_f16_instance_rank4_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce4.cpp
device_softmax_f32_f32_instance.cpp
device_softmax_f32_f32_instance_rank3_reduce1.cpp
device_softmax_f32_f32_instance_rank3_reduce2.cpp
device_softmax_f32_f32_instance_rank3_reduce3.cpp
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
{
add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
}
void add_device_softmax_f16_f16_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
{
add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f16_f16_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f16_f16_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 2>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f16_f16_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 3>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 2>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 3>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 4>{});
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 4>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
}
} // namespace instance
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
{
add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
}
void add_device_softmax_f32_f32_rank4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
{
add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f32_f32_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f32_f32_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 2>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f32_f32_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 3>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
}
} // namespace instance
......
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f32_f32_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
}
} // namespace instance
......