Commit 19bb36d7 authored by wangshaojie6's avatar wangshaojie6
Browse files

merge dev branch

parents 3e7f7997 2e6eaf6e
......@@ -11,9 +11,6 @@
#include "blockwise_tensor_slice_transfer_v6r1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#define A_BLOCK_BANK_CONFLICT_FREE_WRW 1
#define B_BLOCK_BANK_CONFLICT_FREE_WRW 1
namespace ck {
template <typename GridwiseGemm,
......@@ -43,6 +40,7 @@ __global__ void
const CElementwiseOperation c_element_op,
const CBlockClusterAdaptor c_block_cluster_adaptor)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
......@@ -59,6 +57,18 @@ __global__ void
b_element_op,
c_element_op,
c_block_cluster_adaptor);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = a_b_k0_m_k1_grid_desc;
ignore = b_b_k0_n_k1_grid_desc;
ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = c_block_cluster_adaptor;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template <index_t BlockSize,
......@@ -99,7 +109,9 @@ template <index_t BlockSize,
index_t CShuffleMRepeatPerShuffle,
index_t CShuffleNRepeatPerShuffle,
index_t CBlockTransferScalarPerVector_NWaveNPerXDL,
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>
typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
bool ABlockLdsExtraM1Wrw = false,
bool BBlockLdsExtraN1Wrw = false>
struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
{
static constexpr auto I0 = Number<0>{};
......@@ -111,20 +123,22 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
// Bytes per 32 lds bank: 32 * 4 bytes
static constexpr auto BankLength = Number<128>{};
static constexpr auto ElePerBank = Number<BankLength / sizeof(FloatAB)>{};
// K1 should be Number<...>
static constexpr auto K1 = Number<K1Value>{};
// M1 & N1
static constexpr auto ElePerBank = Number<64>{};
// M1 & M0
static constexpr auto M1PerBlock = Number<ElePerBank / K1Value>{};
static constexpr auto N1PerBlock = Number<ElePerBank / K1Value>{};
// M0 & N0
static constexpr auto M0PerBlock = Number<MPerBlock / M1PerBlock>{};
static constexpr auto N0PerBlock = Number<NPerBlock / M1PerBlock>{};
static constexpr auto M1Padding = I4;
// M1 padding num
static constexpr auto M1Padding = Number<4>{};
static constexpr auto N1Padding = M1Padding;
// N1 & N0
static constexpr auto N1PerBlock = Number<ElePerBank / K1Value>{};
static constexpr auto N0PerBlock = Number<NPerBlock / M1PerBlock>{};
static constexpr auto N1Padding = I4;
__host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
{
......@@ -134,26 +148,33 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto a_block_desc_k0_m_k1 = [&]() {
if constexpr(ABlockLdsExtraM)
{
#if A_BLOCK_BANK_CONFLICT_FREE_WRW
if constexpr(ABlockLdsExtraM1Wrw)
{
constexpr auto a_block_desc_k0_m0_m1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<M0PerBlock>{}, Number<M1PerBlock>{}, K1),
make_tuple(Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding), Number<M1PerBlock>{} * K1 + M1Padding, K1, I1));
make_tuple(
Number<K0PerBlock>{}, Number<M0PerBlock>{}, Number<M1PerBlock>{}, K1),
make_tuple(Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding),
Number<M1PerBlock>{} * K1 + M1Padding,
K1,
I1));
constexpr auto a_block_desc_k0_m_k1_tmp = transform_tensor_descriptor(
a_block_desc_k0_m0_m1_k1,
make_tuple(make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod(make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})),
make_merge_transform_v3_division_mod(
make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})
);
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
return a_block_desc_k0_m_k1_tmp;
#else
}
else
{
return make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1));
#endif
}
}
else
{
......@@ -173,32 +194,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto a_block_desc_b_k0_m_k1 = [&]() {
if constexpr(ABlockLdsExtraM)
{
#if A_BLOCK_BANK_CONFLICT_FREE_WRW
if constexpr(ABlockLdsExtraM1Wrw)
{
constexpr auto a_block_desc_b_k0_m0_m1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<M0PerBlock>{}, Number<M1PerBlock>{}, K1),
make_tuple(Number<K0PerBlock>{} * Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding), Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding), Number<M1PerBlock>{} * K1 + M1Padding, K1, I1));
make_tuple(Number<1>{},
Number<K0PerBlock>{},
Number<M0PerBlock>{},
Number<M1PerBlock>{},
K1),
make_tuple(Number<K0PerBlock>{} * Number<M0PerBlock>{} *
(Number<M1PerBlock>{} * K1 + M1Padding),
Number<M0PerBlock>{} * (Number<M1PerBlock>{} * K1 + M1Padding),
Number<M1PerBlock>{} * K1 + M1Padding,
K1,
I1));
constexpr auto a_block_desc_b_k0_m_k1_tmp = transform_tensor_descriptor(
a_block_desc_b_k0_m0_m1_k1,
make_tuple(make_pass_through_transform(Number<1>{}),
make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod_for_wrw(make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})),
make_merge_transform_v3_division_mod_for_wrw(
make_tuple(Number<M0PerBlock>{}, Number<M1PerBlock>{})),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})
);
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
return a_block_desc_b_k0_m_k1_tmp;
#else
}
else
{
return make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<K0PerBlock>{} * Number<MPerBlock + 1>{} * K1, Number<MPerBlock + 1>{} * K1, K1, I1));
#endif
make_tuple(Number<K0PerBlock>{} * Number<MPerBlock + 1>{} * K1,
Number<MPerBlock + 1>{} * K1,
K1,
I1));
}
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
max_lds_align);
}
}();
......@@ -213,27 +250,33 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto b_block_desc_k0_n_k1 = [&]() {
if constexpr(BBlockLdsExtraN)
{
#if B_BLOCK_BANK_CONFLICT_FREE_WRW
if constexpr(BBlockLdsExtraN1Wrw)
{
constexpr auto b_block_desc_k0_n0_n1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<N0PerBlock>{}, Number<N1PerBlock>{}, K1),
make_tuple(Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding), Number<N1PerBlock>{} * K1 + N1Padding, K1, I1));
make_tuple(
Number<K0PerBlock>{}, Number<N0PerBlock>{}, Number<N1PerBlock>{}, K1),
make_tuple(Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding),
Number<N1PerBlock>{} * K1 + N1Padding,
K1,
I1));
constexpr auto b_block_desc_k0_n_k1_tmp = transform_tensor_descriptor(
b_block_desc_k0_n0_n1_k1,
make_tuple(make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod(make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})),
make_merge_transform_v3_division_mod(
make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})
);
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
return b_block_desc_k0_n_k1_tmp;
#else
}
else
{
return make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1));
#endif
}
}
else
{
......@@ -253,32 +296,48 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto b_block_desc_b_k0_n_k1 = [&]() {
if constexpr(BBlockLdsExtraN)
{
#if B_BLOCK_BANK_CONFLICT_FREE_WRW
if constexpr(BBlockLdsExtraN1Wrw)
{
constexpr auto b_block_desc_b_k0_n0_n1_k1 = make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<N0PerBlock>{}, Number<N1PerBlock>{}, K1),
make_tuple(Number<K0PerBlock>{} * Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding), Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding), Number<N1PerBlock>{} * K1 + N1Padding, K1, I1));
make_tuple(Number<1>{},
Number<K0PerBlock>{},
Number<N0PerBlock>{},
Number<N1PerBlock>{},
K1),
make_tuple(Number<K0PerBlock>{} * Number<N0PerBlock>{} *
(Number<N1PerBlock>{} * K1 + N1Padding),
Number<N0PerBlock>{} * (Number<N1PerBlock>{} * K1 + N1Padding),
Number<N1PerBlock>{} * K1 + N1Padding,
K1,
I1));
constexpr auto b_block_desc_b_k0_n_k1_tmp = transform_tensor_descriptor(
b_block_desc_b_k0_n0_n1_k1,
make_tuple(make_pass_through_transform(Number<1>{}),
make_pass_through_transform(Number<K0PerBlock>{}),
make_merge_transform_v3_division_mod_for_wrw(make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})),
make_merge_transform_v3_division_mod_for_wrw(
make_tuple(Number<N0PerBlock>{}, Number<N1PerBlock>{})),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})
);
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
return b_block_desc_b_k0_n_k1_tmp;
#else
}
else
{
return make_naive_tensor_descriptor(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<K0PerBlock>{} * Number<NPerBlock + 1>{} * K1, Number<NPerBlock + 1>{} * K1, K1, I1));
#endif
make_tuple(Number<K0PerBlock>{} * Number<NPerBlock + 1>{} * K1,
Number<NPerBlock + 1>{} * K1,
K1,
I1));
}
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
make_tuple(Number<1>{}, Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
max_lds_align);
}
}();
......@@ -296,11 +355,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
constexpr auto b_b_k0_n_k1_block_desc = GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1();
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size =
math::integer_least_multiple(a_b_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto a_block_space_size = math::integer_least_multiple(
a_b_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_space_size =
math::integer_least_multiple(b_b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_space_size = math::integer_least_multiple(
b_b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto c_block_size =
GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock().GetElementSpaceSize();
......@@ -557,8 +616,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
// register
// sanity check
constexpr index_t KPack = math::max(
K1, MfmaSelector<FloatAB, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
constexpr index_t KPack =
math::max(K1, MfmaSelector<FloatAB, MPerXDL, NPerXDL>::selected_mfma.k_per_blk);
auto blockwise_gemm =
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
......
......@@ -42,6 +42,7 @@ __global__ void
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>(
......@@ -56,6 +57,18 @@ __global__ void
b_element_op,
c_element_op,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template <
......
......@@ -45,6 +45,7 @@ __global__ void
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>(
......@@ -61,6 +62,20 @@ __global__ void
b_element_op,
c_element_op,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = p_c0_grid;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template <
......
......@@ -49,6 +49,7 @@ __global__ void
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>(
......@@ -67,6 +68,22 @@ __global__ void
b_element_op,
c_element_op,
block_2_ctile_map);
#else
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
ignore = p_c0_grid;
ignore = p_c1_grid;
ignore = a_grid_desc_k0_m_k1;
ignore = b_grid_desc_k0_n_k1;
ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template <
......
......@@ -36,6 +36,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
DataType value)
{
using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<DataType, DataType>;
constexpr auto I0 = Number<0>{};
......
......@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// TODO make this logic more generic for more sub-dword datatype
if constexpr(SrcVectorDim != DstVectorDim &&
is_same<half_t, remove_cvref_t<SrcData>>::value &&
((is_same<half_t, remove_cvref_t<SrcData>>::value &&
is_same<half_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
(is_same<int8_t, remove_cvref_t<SrcData>>::value &&
is_same<int8_t, remove_cvref_t<DstData>>::value &&
SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
{
// each transpose does
// DstScalarPerVector # of src vectors in src_thread_scratch_
......
......@@ -13,6 +13,7 @@
#include "functional3.hpp"
#include "functional4.hpp"
#include "enable_if.hpp"
#include "ignore.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
......@@ -30,6 +31,7 @@
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
#include "get_id.hpp"
#include "synchronization.hpp"
#include "amd_address_space.hpp"
......
......@@ -992,77 +992,6 @@ inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
return uint16_t(u.int32 >> 16);
}
// TODO: deprecate this
template <typename T>
struct inner_product_with_conversion
{
template <typename X, index_t N>
__device__ T operator()(typename vector_type<X, N>::type a,
typename vector_type<X, N>::type b) const
{
const vector_type<X, N> a_vector{a};
const vector_type<X, N> b_vector{b};
T acc = 0;
static_for<0, N, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.Scalars()[i]) * type_convert<T>(b_vector.Scalars()[i]);
});
return acc;
}
__device__ T operator()(float_t a, float_t b) const
{
return type_convert<T>(a) * type_convert<T>(b);
}
__device__ T operator()(int8x4_t a, int8x4_t b) const
{
const vector_type<int8_t, 4> a_vector{a};
const vector_type<int8_t, 4> b_vector{b};
T acc = 0;
static_for<0, 4, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.AsType<int8_t>()[i]) *
type_convert<T>(b_vector.AsType<int8_t>()[i]);
});
return acc;
}
__device__ T operator()(int8x8_t a, int8x8_t b) const
{
const vector_type<int8_t, 8> a_vector{a};
const vector_type<int8_t, 8> b_vector{b};
T acc = 0;
static_for<0, 8, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.AsType<int8_t>()[i]) *
type_convert<T>(b_vector.AsType<int8_t>()[i]);
});
return acc;
}
__device__ T operator()(int8x16_t a, int8x16_t b) const
{
const vector_type<int8_t, 16> a_vector{a};
const vector_type<int8_t, 16> b_vector{b};
T acc = 0;
static_for<0, 16, 1>{}([&](auto i) {
acc += type_convert<T>(a_vector.AsType<int8_t>()[i]) *
type_convert<T>(b_vector.AsType<int8_t>()[i]);
});
return acc;
}
};
template <typename T>
struct NumericLimits
{
......
#pragma once
#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
#include "config.hpp"
#include "enable_if.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
namespace ck {
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
template <AddressSpaceEnum BufferAddressSpace,
typename T,
typename ElementSpaceSize,
......@@ -266,9 +271,6 @@ struct DynamicBuffer
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
//if(get_block_1d_id() == 0){
// printf("%d, tid=%d, i=%d\n", __LINE__, get_thread_local_1d_id(), i);
//}
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
......@@ -319,9 +321,7 @@ struct DynamicBuffer
{
if(is_valid_element)
{
// FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
// calling it
atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
}
......
#pragma once
#include "data_type.hpp"
namespace ck {
template <typename X>
__device__ X atomic_add(X* p_dst, const X& x);
template <>
__device__ int32_t atomic_add<int32_t>(int32_t* p_dst, const int32_t& x)
{
return atomicAdd(p_dst, x);
}
template <>
__device__ uint32_t atomic_add<uint32_t>(uint32_t* p_dst, const uint32_t& x)
{
return atomicAdd(p_dst, x);
}
template <>
__device__ float atomic_add<float>(float* p_dst, const float& x)
{
return atomicAdd(p_dst, x);
}
template <>
__device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
const vector_type<float, 2> vx{x};
vector_type<float, 2> vy{0};
vy.template AsType<float>()(I0) =
atomicAdd(c_style_pointer_cast<float*>(p_dst), vx.template AsType<float>()[I0]);
vy.template AsType<float>()(I1) =
atomicAdd(c_style_pointer_cast<float*>(p_dst) + 1, vx.template AsType<float>()[I1]);
return vy.template AsType<float2_t>()[I0];
}
} // namespace ck
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include <cmath>
#include "data_type.hpp"
#include "half.hpp"
namespace ck {
namespace math {
static inline __device__ half_t abs(half_t x) { return __habs(x); };
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
static inline __host__ float abs(float x) { return std::abs(x); };
static inline __host__ double abs(double x) { return std::abs(x); };
static inline __host__ int8_t abs(int8_t x)
{
int8_t sgn = x >> (8 - 1);
return (x ^ sgn) - sgn;
};
static inline __host__ int32_t abs(int32_t x)
{
int32_t sgn = x >> (32 - 1);
return (x ^ sgn) - sgn;
};
static inline __host__ half_t abs(half_t x)
{
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
half_float::half abs_xx = half_float::abs(xx);
half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
return abs_x;
};
static inline __host__ float isnan(float x) { return std::isnan(x); };
static inline __host__ double isnan(double x) { return std::isnan(x); };
static inline __host__ int8_t isnan(int8_t x)
{
(void)x;
return false;
};
static inline __host__ int32_t isnan(int32_t x)
{
(void)x;
return false;
};
static inline __host__ bool isnan(half_t x)
{
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
return half_float::isnan(xx);
};
} // namespace math
} // namespace ck
......
......@@ -33,7 +33,7 @@ namespace ck {
struct float_equal_one
{
template <class T>
__device__ inline bool operator()(T x)
__host__ __device__ inline bool operator()(T x)
{
return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
};
......@@ -42,7 +42,7 @@ struct float_equal_one
struct float_equal_zero
{
template <class T>
__device__ inline bool operator()(T x)
__host__ __device__ inline bool operator()(T x)
{
return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
};
......
......@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
template <index_t NX, index_t NY>
struct transpose_vectors<half_t, NX, NY>
{
// we got [NY * NX] ammount of S data to be transposed
// we got [NY * NX] amount of S data to be transposed
static constexpr index_t s_per_x = NY;
static constexpr index_t s_per_y = NX;
......@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
}
};
// transpose int8 4x4
__device__ void transpose_int8_4x4(const int8x4_t& x0,
const int8x4_t& x1,
const int8x4_t& x2,
const int8x4_t& x3,
int8x4_t& y0,
int8x4_t& y1,
int8x4_t& y2,
int8x4_t& y3)
{
int32_t t0, t1;
int32_t z0, z1, z2, z3;
constexpr int32_t m0 = 0x05010400;
constexpr int32_t m1 = 0x05040100;
constexpr int32_t m2 = 0x07060302;
constexpr int32_t m3 = 0x07030602;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits first)
// clang-format off
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m0));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z0) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z1) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m3));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z2) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z3) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
// clang-format on
y0 = bit_cast<int8x4_t>(z0);
y1 = bit_cast<int8x4_t>(z1);
y2 = bit_cast<int8x4_t>(z2);
y3 = bit_cast<int8x4_t>(z3);
}
template <index_t NX, index_t NY>
struct transpose_vectors<int8_t, NX, NY>
{
// we got [NY * NX] amount of S data to be transposed
static constexpr index_t s_per_x = NY;
static constexpr index_t s_per_y = NX;
using S = int8_t;
using VX = vector_type<int8_t, s_per_x>;
using VY = vector_type<int8_t, s_per_y>;
__device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
StaticallyIndexedArray<VY&, NY>& vy_tuple)
{
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
// loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
static_for<0, NY, 4>{}([&](auto iy) {
static_for<0, NX, 4>{}([&](auto ix) {
// reference to 4 int8 data from vx_tuple
const auto& x_s4_0 = vx_tuple[ix].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_1 = vx_tuple[ix + I1].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_2 = vx_tuple[ix + I2].template AsType<int8x4_t>()[iy / I4];
const auto& x_s4_3 = vx_tuple[ix + I3].template AsType<int8x4_t>()[iy / I4];
// reference to 4 int8 data from vy_tuple
auto& y_s4_0 = vy_tuple(iy).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_1 = vy_tuple(iy + I1).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_2 = vy_tuple(iy + I2).template AsType<int8x4_t>()(ix / I4);
auto& y_s4_3 = vy_tuple(iy + I3).template AsType<int8x4_t>()(ix / I4);
// transpose
transpose_int8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3);
});
});
}
};
} // namespace ck
#endif
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/utility)
......@@ -26,7 +26,6 @@
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits>
#include <cmath>
#include <cassert>
......@@ -34,6 +33,8 @@
#include <string>
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"
namespace ck {
......@@ -42,34 +43,10 @@ namespace host_reduce {
using ck::NanPropagation;
using ck::ReduceTensorOp;
template <typename T>
static inline bool float_equal_one(T);
static inline bool float_equal_one(float x) { return x == 1.0f; };
static inline bool float_equal_one(double x) { return x == 1.0; };
static inline bool float_equal_one(half_float::half x)
{
return x == static_cast<half_float::half>(1.0f);
};
template <typename T>
static inline bool float_equal_zero(T x);
static inline bool float_equal_zero(float x) { return x == 0.0f; };
static inline bool float_equal_zero(double x) { return x == 0.0; };
static inline bool float_equal_zero(half_float::half x)
{
return x == static_cast<half_float::half>(0.0f);
};
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
using std::abs;
using ck::math::abs;
if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
{
......@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
}
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return (std::numeric_limits<AccDataType>::max());
return (ck::NumericLimits<AccDataType>::Max());
}
else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
{
return (std::numeric_limits<AccDataType>::lowest());
return (ck::NumericLimits<AccDataType>::Lowest());
}
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{
......@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType& accuVal,
AccDataType currVal)
{
using std::isnan;
using ck::math::isnan;
if constexpr(!PropagateNan)
{
......@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
int& accuIndex,
int currIndex)
{
using std::isnan;
using ck::math::isnan;
if constexpr(!PropagateNan)
{
......
......@@ -32,6 +32,7 @@
#include <functional>
#include "reduction_enums.hpp"
#include "reduction_common.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
......@@ -196,10 +197,10 @@ struct ReductionHost
OutDataType* out_data,
IndexDataType* out_indices)
{
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
......@@ -227,10 +228,10 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
......@@ -263,13 +264,13 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
......@@ -303,10 +304,10 @@ struct ReductionHost
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::float_equal_one;
using ck::float_equal_zero;
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
......@@ -330,10 +331,10 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
......@@ -361,13 +362,13 @@ struct ReductionHost
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
if(!float_equal_one{}(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
if(!float_equal_zero{}(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
......
#pragma once
#include <algorithm>
#include <random>
#include "data_type.hpp"
namespace ck {
namespace utils {
// template <typename T, class Enable = void>
// struct FillUniform;
// TODO: what's wrong with this specialization???
// err: segmentation fault in mt19937 - infinite loop like.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
// !std::is_same<T, bhalf_t>::value>::type>
// {
// int a_{0};
// int b_{5};
// // T a_ = T{0};
// // T b_ = T{5};
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen{11939};
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
// std::is_same<T, bhalf_t>::value>::type>
template <typename T>
struct FillUniform
{
float a_{0};
float b_{5};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen{11939};
std::uniform_real_distribution<> dis(a_, b_);
std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
}
};
template <typename T>
struct FillMonotonicSeq
{
T init_value_{0};
T step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, n = init_value_]() mutable {
auto tmp = n;
n += step_;
return tmp;
});
}
};
template <typename T>
struct FillConstant
{
T value_{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, value_);
}
};
} // namespace utils
} // namespace ck
#pragma once
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
namespace ck {
namespace utils {
struct ProfileBestConfig
{
std::string best_op_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_tflops = std::numeric_limits<float>::max();
float best_gb_per_sec = std::numeric_limits<float>::max();
};
/**
* @brief This class describes an operation instance(s).
*
* Op instance defines a particular specializations of operator
* template. Thanks to this specific input/output data types, data
* layouts and modifying elementwise operations it is able to create
* it's input/output tensors, provide pointers to instances which
* can execute it and all operation specific parameters.
*/
template <typename OutDataType, typename... InArgTypes>
class OpInstance
{
public:
template <typename T>
using TensorPtr = std::unique_ptr<Tensor<T>>;
using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
using DeviceBuffers = std::vector<DeviceMemPtr>;
OpInstance() = default;
OpInstance(const OpInstance&) = default;
OpInstance& operator=(const OpInstance&) = default;
virtual ~OpInstance(){};
virtual InTensorsTuple GetInputTensors() const = 0;
virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
virtual std::unique_ptr<tensor_operation::device::BaseArgument>
MakeArgumentPointer(tensor_operation::device::BaseOperator*,
const DeviceBuffers&,
const DeviceMemPtr&) const = 0;
virtual std::size_t GetFlops() const = 0;
virtual std::size_t GetBtype() const = 0;
};
/**
* @brief A generic operation instance run engine.
*/
template <typename OutDataType, typename... InArgTypes>
class OpInstanceRunEngine
{
public:
using OpInstanceT = OpInstance<InArgTypes..., OutDataType>;
template <typename T>
using TensorPtr = std::unique_ptr<Tensor<T>>;
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
using DeviceBuffers = std::vector<DeviceMemPtr>;
using InArgsTypesTuple = std::tuple<InArgTypes...>;
OpInstanceRunEngine() = delete;
template <typename ReferenceOp = std::function<void()>>
OpInstanceRunEngine(const OpInstanceT& op_instance,
const ReferenceOp& reference_op = ReferenceOp{})
: op_instance_{op_instance}
{
in_tensors_ = op_instance_.GetInputTensors();
out_tensor_ = op_instance_.GetOutputTensor();
if constexpr(std::is_invocable_v<ReferenceOp,
const Tensor<InArgTypes>&...,
Tensor<OutDataType>&>)
{
ref_output_ = op_instance_.GetOutputTensor();
CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
}
AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
out_device_buffer_ =
std::make_unique<DeviceMem>(sizeof(OutDataType) * out_tensor_->mDesc.GetElementSpace());
out_device_buffer_->SetZero();
}
virtual ~OpInstanceRunEngine(){};
template <typename OpInstancePtr>
bool Test(const std::vector<OpInstancePtr>& op_ptrs)
{
bool res{true};
for(auto& op_ptr : op_ptrs)
{
auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get());
auto argument = op_instance_.MakeArgumentPointer(
op_ptr.get(), in_device_buffers_, out_device_buffer_);
if(op_ptr->IsSupportedArgument(argument.get()))
{
invoker->Run(argument.get());
out_device_buffer_->FromDevice(out_tensor_->mData.data());
if(!ref_output_)
{
throw std::runtime_error(
"OpInstanceRunEngine::Test: Reference value not availabe."
" You have to provide reference function.");
}
// TODO: enable flexible use of custom check_error functions
res = res && check_err(out_tensor_->mData, ref_output_->mData);
out_device_buffer_->SetZero();
}
}
return res;
}
template <typename OpInstancePtr>
ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
int nrepeat = 100,
bool do_verification = false,
bool do_log = false)
{
bool res{true};
ProfileBestConfig best_config;
for(auto& op_ptr : op_ptrs)
{
auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get());
auto argument = op_instance_.MakeArgumentPointer(
op_ptr.get(), in_device_buffers_, out_device_buffer_);
if(op_ptr->IsSupportedArgument(argument.get()))
{
std::string op_name = op_ptr->GetTypeString();
float avg_time = invoker->Run(argument.get(), nrepeat);
std::size_t flops = op_instance_.GetFlops();
std::size_t num_btype = op_instance_.GetBtype();
float tflops = static_cast<float>(flops) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << op_name << std::endl;
if(tflops < best_config.best_tflops)
{
best_config.best_op_name = op_name;
best_config.best_tflops = tflops;
best_config.best_gb_per_sec = gb_per_sec;
best_config.best_avg_time = avg_time;
}
if(do_verification)
{
out_device_buffer_->FromDevice(out_tensor_->mData.data());
if(!ref_output_)
{
throw std::runtime_error(
"OpInstanceRunEngine::Profile: Reference value not availabe."
" You have to provide reference function.");
}
// TODO: enable flexible use of custom check_error functions
res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
if(do_log) {}
}
out_device_buffer_->SetZero();
}
}
return best_config;
}
void SetAtol(double a) { atol_ = a; }
void SetRtol(double r) { rtol_ = r; }
private:
template <typename F, std::size_t... Is>
void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
{
f(*std::get<Is>(in_tensors_)..., *ref_output_);
}
template <std::size_t... Is>
void AllocateDeviceInputTensors(std::index_sequence<Is...>)
{
(AllocateDeviceInputTensorsImpl<Is>(), ...);
}
template <std::size_t Index>
void AllocateDeviceInputTensorsImpl()
{
const auto& ts = std::get<Index>(in_tensors_);
in_device_buffers_
.emplace_back(
std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
ts->mDesc.GetElementSpace()))
->ToDevice(ts->mData.data());
}
static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
const OpInstanceT& op_instance_;
double rtol_{1e-5};
double atol_{1e-8};
InTensorsTuple in_tensors_;
TensorPtr<OutDataType> out_tensor_;
TensorPtr<OutDataType> ref_output_;
DeviceBuffers in_device_buffers_;
DeviceMemPtr out_device_buffer_;
template <typename T>
bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
{
return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
}
};
} // namespace utils
} // namespace ck
......@@ -31,7 +31,6 @@ using device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple<
//DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>,
DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>,
//DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 8>,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment