"tests/nn/git@developer.sourcefind.cn:OpenDAS/fairscale.git" did not exist on "885533739160d4f25da4a2ba6022ac1f5d755a91"
Commit a9031464 authored by Chao Liu

implicit gemm v1r3 nchw_cyxk_nkhw

parent 569ad66e
@@ -128,7 +128,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
-constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
+constexpr index_t InBlockReorderDataPerWrite_N = 1;
using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
@@ -163,7 +163,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
-constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
+constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
...
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hip.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight on the host from KCYX to CYXK, so K becomes the fastest-varying dimension
// and the kernel can read it vectorized (see WeiBlockCopyDataPerRead_K)
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
// for 3x3, 28x28, v1r2, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 2;
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0
// for 3x3, 28x28, v1r3, Pascal, bad
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 1
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#endif
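// one workgroup per output tile:
// GridSize = ceil(N / NPerBlock) * ceil(K / KPerBlock) * ceil(Ho / HoPerBlock) * ceil(Wo / WoPerBlock)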
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw<
GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_nkhw_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_W>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
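For reference, a minimal sketch of how this new driver could be invoked. The tensor sizes, descriptor construction, and call site below are illustrative assumptions (they are not part of this commit), chosen to match the 3x3, 34x34 tuning block above:
// hypothetical call site (sizes are assumptions, not from this commit)
constexpr auto in_nchw_desc = make_ConstantTensorDescriptor(Sequence<64, 256, 34, 34>{}); // N, C, Hi, Wi
constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor(Sequence<128, 256, 3, 3>{}); // K, C, Y, X
constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor(Sequence<64, 128, 32, 32>{}); // N, K, Ho, Wo
Tensor<float> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<float> out_nkhw(make_TensorDescriptor(out_nkhw_desc));
// ... fill in_nchw and wei_kcyx with test data ...
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw, 10 /*nrepeat*/);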
@@ -12,7 +12,7 @@
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
-//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
struct GeneratorTensor_1
@@ -605,8 +605,10 @@ int main(int argc, char* argv[])
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 0
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-#elif 1
+#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
+#elif 1
+device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
#elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
#endif
...
@@ -24,7 +24,7 @@ struct Array
{
Array<TData, NSize + 1> new_array;
-static_for<0, NSize, 1>{}([=](auto I) {
+static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
new_array[i] = mData[i];
});
...
@@ -137,11 +137,16 @@ struct ConstantTensorDescriptor
}
template <index_t... Is>
-__host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> multi_id)
+__host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> /*multi_id*/)
{
static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent");
-return Get1dIndex(Is...);
+constexpr auto multi_id = Sequence<Is...>{};
+constexpr auto seq_tmp =
+    transform_sequences(mod_conv::multiplies<index_t>{}, multi_id, GetStrides());
+return accumulate_on_sequence(seq_tmp, mod_conv::plus<index_t>{}, Number<0>{});
}
__host__ __device__ static Array<index_t, nDim> GetMultiIndex(index_t id)
...
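Purely for illustration (the lengths below are assumed, not part of the commit): the rewritten Get1dIndex is the compile-time dot product of the multi-index with the descriptor's strides, so for a packed 4-d descriptor it reduces to the familiar offset formula.
// illustrative sketch: a packed descriptor of lengths (2, 3, 4, 5) has strides (60, 20, 5, 1)
constexpr auto desc = make_ConstantTensorDescriptor(Sequence<2, 3, 4, 5>{});
static_assert(desc.Get1dIndex(Sequence<1, 2, 3, 4>{}) == 1 * 60 + 2 * 20 + 3 * 5 + 4 * 1, // == 119
              "offset == dot(multi_id, strides)");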
@@ -246,7 +246,8 @@ struct accumulate_on_sequence_f
};
template <class Seq, class Reduce, index_t I>
-__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce, Number<I>)
+__host__ __device__ constexpr index_t
+accumulate_on_sequence(Seq, Reduce, Number<I> /*initial_value*/)
{
constexpr index_t a =
    static_const_reduce_n<Seq::mSize>{}(accumulate_on_sequence_f<Seq>{}, Reduce{});
...
@@ -471,7 +471,6 @@ struct Blockwise2dTensorCopy3
DstDesc{}.GetStride(I0) % DataPerRead == 0,
"src and dst stride should be multiple of DataPerRead to keep alignment");
-constexpr index_t L0 = CopyLengths{}.Get(I0);
constexpr index_t L1 = CopyLengths{}.Get(I1);
constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
...
@@ -761,339 +761,3 @@ struct Blockwise4dTensorCopyReorder1
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}
};
// (the removed block that follows is the old Blockwise4dTensorCopyReorder3, which reappears in
// blockwise_nd_tensor_op.hip.hpp as BlockwiseNdTensorCopyReorder_v3)
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SrcLengths,
class SrcSubLengths,
class SrcClusterLengths,
class MapDst2Src,
class MapThreadCluster2SrcCluster,
index_t SrcDataPerRead,
index_t DstDataPerWrite>
struct Blockwise4dTensorCopyReorder3
{
static constexpr index_t nDim = SrcLengths::GetSize();
index_t mSrcMyThreadOffset;
index_t mDstMyThreadOffset;
__device__ Blockwise4dTensorCopyReorder3()
{
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto src_lengths = SrcLengths{};
constexpr auto map_dst2src = MapDst2Src{};
constexpr auto src_sub_lengths = SrcSubLengths{};
constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src);
constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{};
constexpr auto src_cluster_lengths = SrcClusterLengths{};
constexpr auto thread_cluster_lengths =
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);
// sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
// sanity check: nDim
static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim &&
SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
MapThreadCluster2SrcCluster::GetSize() == nDim,
"wrong! nDim is not consistent\n");
// sanity check: BlockSize
constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize();
static_assert(BlockSize >= num_active_thread,
"wrong! BlockSize is not big enough for ThreadPerDims!");
// sanity check: work division
static_for<0, nDim, 1>{}([](auto IDim) {
constexpr auto I = decltype(IDim){};
constexpr index_t src_len = src_lengths.Get(I);
constexpr index_t src_sub_len = src_sub_lengths.Get(I);
constexpr index_t src_cluster_len = src_cluster_lengths.Get(I);
static_assert(src_len % (src_sub_len * src_cluster_len) == 0,
"wrong! cannot evenly divide Src tensor lengths");
});
// sanity check: src read
static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4,
"wrong! only support SrcDataPerRead == 1, 2 or 4!\n");
static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number<nDim - 1>{}) == 1,
"wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n");
static_assert(src_sub_lengths.Get(Number<nDim - 1>{}) % SrcDataPerRead == 0,
"wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n");
static_assert(src_desc.GetStride(Number<nDim - 2>{}) % SrcDataPerRead == 0,
"wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to "
"keep alignment");
// sanity check: dst write
static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
"wrong! only support DstDataPerWrite == 1, 2 or 4!\n");
static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
"wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");
static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
"wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");
static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
"wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
"keep alignment");
// start dividing work
if(BlockSize > num_active_thread)
{
if(get_thread_local_1d_id() >= num_active_thread)
{
return;
}
}
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
// compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
// registers, or only one copy???
auto src_data_multi_id =
reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto I = decltype(IDim){};
constexpr index_t i = I.Get();
// compiler: will it really compute index here, or be associated with Get1dIndex and
// optimized away???
src_data_multi_id[i] *= src_sub_lengths.Get(I);
});
// compiler: will it really compute index here, or be associated with Get1dIndex and
// optimized away???
const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);
mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);
#if 0
if(get_block_1d_id() == 0)
{
printf("tid %5u, "
"thread_multi_id %5u %5u %5u %5u, "
"src_data_multi_id %5u %5u %5u %5u, "
"dst_data_multi_id %5u %5u %5u %5u, "
"mSrcMyThreadOffset %u, mDstMyThreadOffset %u\n",
get_thread_local_1d_id(),
thread_multi_id[0],
thread_multi_id[1],
thread_multi_id[2],
thread_multi_id[3],
src_data_multi_id[0],
src_data_multi_id[1],
src_data_multi_id[2],
src_data_multi_id[3],
dst_data_multi_id[0],
dst_data_multi_id[1],
dst_data_multi_id[2],
dst_data_multi_id[3],
mSrcMyThreadOffset,
mDstMyThreadOffset);
}
#endif
}
__device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
constexpr auto src_data_per_cluster_per_dims = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});
constexpr auto cluster_per_dims =
transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
SrcLengths{},
src_data_per_cluster_per_dims);
constexpr auto thread_tensor_lengths = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
constexpr auto src_data_per_cluster_per_dims = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});
constexpr auto cluster_per_dims =
transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
SrcLengths{},
src_data_per_cluster_per_dims);
constexpr auto thread_tensor_lengths = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
constexpr auto thread_sub_tensor_desc =
make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides());
#if 1
for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0)
{
for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1)
{
for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2)
{
for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3);
++icluster_d3)
{
const index_t src_offset = SrcDesc{}.Get1dIndex(
icluster_d0 * src_data_per_cluster_per_dims.Get(I0),
icluster_d1 * src_data_per_cluster_per_dims.Get(I1),
icluster_d2 * src_data_per_cluster_per_dims.Get(I2),
icluster_d3 * src_data_per_cluster_per_dims.Get(I3));
const index_t clipboard_offset = thread_tensor_desc.Get1dIndex(
icluster_d0 * thread_sub_tensor_lengths.Get(I0),
icluster_d1 * thread_sub_tensor_lengths.Get(I1),
icluster_d2 * thread_sub_tensor_lengths.Get(I2),
icluster_d3 * thread_sub_tensor_lengths.Get(I3));
threadwise_nd_tensor_copy(SrcDesc{},
p_src + src_offset + mSrcMyThreadOffset,
thread_tensor_desc,
p_clipboard + clipboard_offset,
thread_sub_tensor_lengths,
Number<SrcDataPerRead>{});
}
}
}
}
#else
static_ford<decltype(cluster_per_dims)>{}([=](auto cluster_ids) {
});
#endif
#if 0
if(get_block_1d_id() == 0)
{
printf("tid %5u, "
"data: %f %f %f %f %f %f %f %f\n",
get_thread_local_1d_id(),
p_clipboard[0],
p_clipboard[1],
p_clipboard[2],
p_clipboard[3],
p_clipboard[4],
p_clipboard[5],
p_clipboard[6],
p_clipboard[7]);
}
#endif
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
constexpr auto src_data_per_cluster_per_dims = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});
constexpr auto cluster_per_dims =
transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
SrcLengths{},
src_data_per_cluster_per_dims);
constexpr auto thread_tensor_lengths = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
constexpr auto thread_sub_tensor_desc =
make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides());
for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0)
{
for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1)
{
for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2)
{
for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3);
++icluster_d3)
{
const index_t clipboard_offset = thread_tensor_desc.Get1dIndex(
icluster_d0 * thread_sub_tensor_lengths.Get(I0),
icluster_d1 * thread_sub_tensor_lengths.Get(I1),
icluster_d2 * thread_sub_tensor_lengths.Get(I2),
icluster_d3 * thread_sub_tensor_lengths.Get(I3));
const auto dst_multi_id = reorder_array_given_new2old(
Array<index_t, nDim>{
icluster_d0 * src_data_per_cluster_per_dims.Get(I0),
icluster_d1 * src_data_per_cluster_per_dims.Get(I1),
icluster_d2 * src_data_per_cluster_per_dims.Get(I2),
icluster_d3 * src_data_per_cluster_per_dims.Get(I3)},
MapDst2Src{});
const index_t dst_offset = DstDesc{}.Get1dIndex(dst_multi_id);
#if 0
if(get_block_1d_id() == 0)
{
printf("tid %5u, "
"clipboard_offsetm %5u, dst_offset %5u\n",
get_thread_local_1d_id(),
clipboard_offset,
dst_offset);
}
#endif
threadwise_4d_tensor_copy_reorder_given_dst2src_v2(
thread_tensor_desc,
p_clipboard + clipboard_offset,
DstDesc{},
p_dst + dst_offset + mDstMyThreadOffset,
thread_sub_tensor_lengths,
MapDst2Src{});
}
}
}
}
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
}
};
@@ -53,7 +53,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
constexpr index_t M = a_block_mtx.NCol(); // A is transposed
constexpr index_t N = b_block_mtx.NCol();
-constexpr index_t K = a_block_mtx.NRow();
constexpr index_t MPerThread = c_thread_mtx.NRow();
constexpr index_t NPerThread = c_thread_mtx.NCol();
@@ -114,8 +113,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
__device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const
{
-constexpr index_t BatchThreadWork = BatchSize / BatchPerThread;
constexpr index_t ThreadPerLevel1Cluster =
    MLevel0Cluster * NLevel0Cluster * MLevel1Cluster * NLevel1Cluster;
...
#pragma once
#include "threadwise_nd_tensor_op.hip.hpp"
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SrcLengths,
class SrcSubLengths,
class SrcClusterLengths,
class MapDst2Src,
class MapThreadCluster2SrcCluster,
index_t SrcDataPerRead,
index_t DstDataPerWrite>
struct BlockwiseNdTensorCopyReorder_v3
{
static constexpr index_t nDim = SrcLengths::GetSize();
index_t mSrcMyThreadOffset;
index_t mDstMyThreadOffset;
__device__ BlockwiseNdTensorCopyReorder_v3()
{
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto src_lengths = SrcLengths{};
constexpr auto map_dst2src = MapDst2Src{};
constexpr auto src_sub_lengths = SrcSubLengths{};
constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src);
constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{};
constexpr auto src_cluster_lengths = SrcClusterLengths{};
constexpr auto thread_cluster_lengths =
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);
// sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
// sanity check: nDim
static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim &&
SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
MapThreadCluster2SrcCluster::GetSize() == nDim,
"wrong! nDim is not consistent\n");
// sanity check: BlockSize
constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize();
static_assert(BlockSize >= num_active_thread,
"wrong! BlockSize is not big enough for ThreadPerDims!");
// sanity check: work division
static_for<0, nDim, 1>{}([](auto IDim) {
constexpr auto I = decltype(IDim){};
constexpr index_t src_len = src_lengths.Get(I);
constexpr index_t src_sub_len = src_sub_lengths.Get(I);
constexpr index_t src_cluster_len = src_cluster_lengths.Get(I);
static_assert(src_len % (src_sub_len * src_cluster_len) == 0,
"wrong! cannot evenly divide Src tensor lengths");
});
// sanity check: src read
static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4,
"wrong! only support SrcDataPerRead == 1, 2 or 4!\n");
static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number<nDim - 1>{}) == 1,
"wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n");
static_assert(src_sub_lengths.Get(Number<nDim - 1>{}) % SrcDataPerRead == 0,
"wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n");
static_assert(src_desc.GetStride(Number<nDim - 2>{}) % SrcDataPerRead == 0,
"wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to "
"keep alignment");
// sanity check: dst write
static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
"wrong! only support DstDataPerWrite == 1, 2 or 4!\n");
static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
"wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");
static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
"wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");
static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
"wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
"keep alignment");
// start dividing work
if(BlockSize > num_active_thread)
{
if(get_thread_local_1d_id() >= num_active_thread)
{
return;
}
}
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
// compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
// registers, or only one copy???
auto src_data_multi_id =
reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto I = decltype(IDim){};
constexpr index_t i = I.Get();
// compiler: will it really compute index here, or be associated with Get1dIndex and
// optimized away???
src_data_multi_id[i] *= src_sub_lengths.Get(I);
});
// compiler: will it really compute index here, or be associated with Get1dIndex and
// optimized away???
const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);
mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);
}
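// the per-thread register clipboard holds one SrcSubLengths-sized tile per repeat step;
// in each dimension the number of repeats is ceil(SrcLengths / (SrcSubLengths * SrcClusterLengths))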
__device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
constexpr auto src_data_per_cluster_per_dims = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});
constexpr auto repeat_lengths =
transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
SrcLengths{},
src_data_per_cluster_per_dims);
constexpr auto thread_tensor_lengths = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
constexpr auto src_data_per_cluster_per_dims = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});
constexpr auto repeat_lengths =
transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
SrcLengths{},
src_data_per_cluster_per_dims);
constexpr auto thread_tensor_lengths = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto src_data_multi_id = transform_sequences(
mod_conv::multiplies<index_t>{}, repeat_multi_id, src_data_per_cluster_per_dims);
constexpr auto clipboard_data_multi_id = transform_sequences(
mod_conv::multiplies<index_t>{}, repeat_multi_id, thread_sub_tensor_lengths);
constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id);
constexpr index_t clipboard_offset =
thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);
threadwise_nd_tensor_copy(SrcDesc{},
p_src + src_offset + mSrcMyThreadOffset,
thread_tensor_desc,
p_clipboard + clipboard_offset,
thread_sub_tensor_lengths,
Number<SrcDataPerRead>{});
});
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
constexpr auto src_data_per_cluster_per_dims = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});
constexpr auto repeat_lengths =
transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
SrcLengths{},
src_data_per_cluster_per_dims);
constexpr auto thread_tensor_lengths = transform_sequences(
mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto clipboard_data_multi_id = transform_sequences(
mod_conv::multiplies<index_t>{}, repeat_multi_id, thread_sub_tensor_lengths);
constexpr auto src_data_multi_id = transform_sequences(
mod_conv::multiplies<index_t>{}, repeat_multi_id, src_data_per_cluster_per_dims);
// reorder src_data_multi_id to get dst_data_multi_id
constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{});
constexpr index_t clipboard_offset =
thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);
constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id);
// write in the order of dst
#if 1
threadwise_nd_tensor_copy_reorder_given_dst2src_v2(thread_tensor_desc,
p_clipboard + clipboard_offset,
DstDesc{},
p_dst + dst_offset +
mDstMyThreadOffset,
thread_sub_tensor_lengths,
MapDst2Src{});
#else
threadwise_nd_tensor_copy_reorder_given_dst2src_v3(thread_tensor_desc,
p_clipboard + clipboard_offset,
DstDesc{},
p_dst + dst_offset +
mDstMyThreadOffset,
thread_sub_tensor_lengths,
MapDst2Src{},
Number<DstDataPerWrite>{});
#endif
});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
}
};
@@ -73,7 +73,6 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
template <class InDesc, class WeiDesc, class OutDesc>
__host__ __device__ constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
{
-constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
...
#pragma once
#include "config.h"
+#include "constant_integral.hip.hpp"
template <class T, index_t N>
struct vector_type
@@ -10,6 +11,13 @@ template <>
struct vector_type<float, 1>
{
typedef float MemoryType;
+template <index_t I>
+__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+{
+static_assert(I < 1, "wrong");
+*(reinterpret_cast<float*>(&v) + I) = s;
+}
};
template <>
@@ -20,21 +28,29 @@ struct vector_type<float, 2>
// instruction
typedef float MemoryType __attribute__((ext_vector_type(2)));
#elif DEVICE_BACKEND_CUDA
-// For some reason, CUDA need this definition to, otherwise
+// For some reason, CUDA needs this definition, otherwise
// the compiler won't generate optimal load and store instructions, and
// the kernel would produce wrong results, indicating the compiler fails to generate correct
// instructions
using MemoryType = float2;
#endif
-__host__ __device__ static MemoryType Pack(float s0, float s1)
-{
-union
-{
-MemoryType vector;
-float scalar[2];
-} data;
+union Data
+{
+MemoryType vector;
+float scalar[2];
+};
+template <index_t I>
+__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+{
+static_assert(I < 2, "wrong");
+*(reinterpret_cast<float*>(&v) + I) = s;
+}
+__host__ __device__ static MemoryType Pack(float s0, float s1)
+{
+Data data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
@@ -49,12 +65,19 @@ struct vector_type<float, 4>
// instruction
typedef float MemoryType __attribute__((ext_vector_type(4)));
#elif DEVICE_BACKEND_CUDA
-// For some reason, CUDA need this definition to, otherwise
+// For some reason, CUDA needs this definition, otherwise
// the compiler won't generate optimal load and store instructions, and
// the kernel would produce wrong results, indicating the compiler fails to generate correct
// instructions
using MemoryType = float4;
#endif
+template <index_t I>
+__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+{
+static_assert(I < 4, "wrong");
+*(reinterpret_cast<float*>(&v) + I) = s;
+}
};
#if 0
...
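A minimal sketch of how the new SetScalar helper is meant to be used (the values are illustrative): it writes a single scalar lane of a vector-typed register, which is what the vectorized destination write in threadwise_nd_tensor_copy_reorder_given_dst2src_v3 below relies on.
// illustrative only: fill a float2-backed vector register lane by lane
using vec2_t = typename vector_type<float, 2>::MemoryType;
vec2_t v;
vector_type<float, 2>::SetScalar(v, 1.0f, Number<0>{});
vector_type<float, 2>::SetScalar(v, 2.0f, Number<1>{});
// v now holds {1.0f, 2.0f}, the same result as vector_type<float, 2>::Pack(1.0f, 2.0f)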
@@ -4,7 +4,7 @@
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "blockwise_3d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
#include "threadwise_nd_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_batched_gemm.hip.hpp"
@@ -125,17 +125,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};
const auto blockwise_in_copy_reorder =
-    Blockwise4dTensorCopyReorder3<BlockSize,
+    BlockwiseNdTensorCopyReorder_v3<BlockSize,
Float,
decltype(in_n_c_h_w_global_desc),
decltype(in_c_h_w_n_block_desc),
Sequence<NPerBlock, CPerBlock, HoPerBlock, WiPerBlock>,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
decltype(map_chwn2nchw),
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N>{};
// blockwise wei copy
// format is [CPerBlock, X * KPerBlock]
...
@@ -3,7 +3,7 @@
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
#include "threadwise_nd_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_batched_gemm.hip.hpp"
@@ -133,17 +133,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};
const auto blockwise_in_copy_reorder =
-    Blockwise4dTensorCopyReorder3<BlockSize,
+    BlockwiseNdTensorCopyReorder_v3<BlockSize,
Float,
decltype(in_n_c_h_w_global_desc),
decltype(in_c_h_w_n_block_desc),
Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
decltype(map_chwn2nchw),
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N>{};
// blockwise wei copy
// format is [CPerBlock, KPerBlock]
...
@@ -3,7 +3,7 @@
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
#include "threadwise_nd_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_batched_gemm.hip.hpp"
@@ -130,17 +130,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};
const auto blockwise_in_copy_reorder =
-    Blockwise4dTensorCopyReorder3<BlockSize,
+    BlockwiseNdTensorCopyReorder_v3<BlockSize,
Float,
decltype(in_n_c_h_w_global_desc),
decltype(in_c_h_w_n_block_desc),
Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
decltype(map_chwn2nchw),
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N>{};
// blockwise wei copy
// format is [CPerBlock, KPerBlock]
...
@@ -139,135 +139,6 @@ __device__ void threadwise_4d_tensor_copy_reorder_given_dst2src(SrcDesc,
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}
// (the removed block that follows is the #if 0'd 4-d copies and
// threadwise_4d_tensor_copy_reorder_given_dst2src_v2; their n-d counterparts live in
// threadwise_nd_tensor_op.hip.hpp)
#if 0 // replaced by threadwise_nd_tensor_copy
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
__device__ void threadwise_4d_tensor_copy(
SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)
{
auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
threadwise_4d_tensor_copy_reorder_given_dst2src(
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
}
// need to assume src and dst is aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_4d_tensor_copy_v2(SrcDesc,
const Float* __restrict__ p_src,
DstDesc,
Float* __restrict__ p_dst,
SrcOpLengths,
Number<DataPerRead>)
{
static_assert(SrcDesc{}.GetDimension() == 4 && DstDesc{}.GetDimension() == 4 &&
SrcOpLengths::GetSize() == 4,
"wrong! should be 4 dimension");
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1,
"wrong! only support stride3 == 1!\n");
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
"wrong! only support DataPerRead == 1, 2 or 4!\n");
static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
DstDesc{}.GetStride(I2) % DataPerRead == 0,
"wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
constexpr index_t L3 = SrcOpLengths{}.Get(I3);
static_assert(L3 % DataPerRead == 0, "wrong! L3 should be evenly divided by DataPerRead");
constexpr index_t nloop_d3 = L3 / DataPerRead;
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
{
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
{
const index_t src_index =
src_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);
const index_t dst_index =
dst_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);
*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
*(reinterpret_cast<const vector_t*>(&p_src[src_index]));
}
}
}
}
}
#endif
template <class SrcData,
class DstData,
class SrcDesc,
class DstDesc,
class SrcOpLengths,
class MapDst2Src>
__device__ void
threadwise_4d_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
const SrcData* __restrict__ p_src,
DstDesc,
DstData* __restrict__ p_dst,
SrcOpLengths,
MapDst2Src)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr index_t IR0 = MapDst2Src{}.Get(I0);
constexpr index_t IR1 = MapDst2Src{}.Get(I1);
constexpr index_t IR2 = MapDst2Src{}.Get(I2);
constexpr index_t IR3 = MapDst2Src{}.Get(I3);
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
// ref_desc has dst_desc's ordering
constexpr auto ref_desc =
make_ConstantTensorDescriptor(SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{}));
for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
{
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
{
for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
{
for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
{
const auto dst_multi_id = Array<index_t, 4>{did0, did1, did2, did3};
const auto src_multi_id =
reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
const index_t src_index = src_desc.Get1dIndex(src_multi_id);
p_dst[dst_index] = p_src[src_index];
}
}
}
}
}
template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
...
@@ -50,7 +50,7 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
constexpr index_t nRead = L_Back / DataPerRead;
static_ford<decltype(ref_desc.GetLengths().PopBack())>{}([=](auto Ids) {
-static_for<0, nRead, 1>{}([=](auto IRead) {
+static_for<0, nRead, 1>{}([&](auto IRead) {
constexpr auto multi_id = decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});
const index_t src_index = src_desc.Get1dIndex(multi_id);
@@ -62,3 +62,131 @@
});
});
}
// write in order of src
template <class SrcData,
class DstData,
class SrcDesc,
class DstDesc,
class SrcOpLengths,
class MapDst2Src>
__device__ void
threadwise_nd_tensor_copy_reorder_given_dst2src_v1(SrcDesc,
const SrcData* __restrict__ p_src,
DstDesc,
DstData* __restrict__ p_dst,
SrcOpLengths,
MapDst2Src)
{
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
ford<SrcOpLengths>{}([&](auto src_multi_id) {
const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{});
const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
const index_t src_index = src_desc.Get1dIndex(src_multi_id);
p_dst[dst_index] = p_src[src_index];
});
}
// write in order of dst
template <class SrcData,
class DstData,
class SrcDesc,
class DstDesc,
class SrcOpLengths,
class MapDst2Src>
__device__ void
threadwise_nd_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
const SrcData* __restrict__ p_src,
DstDesc,
DstData* __restrict__ p_dst,
SrcOpLengths,
MapDst2Src)
{
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});
ford<decltype(dst_op_lengths)>{}([&](auto dst_multi_id) {
const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
const index_t src_index = src_desc.Get1dIndex(src_multi_id);
p_dst[dst_index] = p_src[src_index];
});
}
// write in order of dst
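// (compared to v2, v3 vectorizes the dst writes: for each group of DstDataPerWrite consecutive
// dst elements it gathers the corresponding src scalars into a vector register with
// vector_type<>::SetScalar, then issues a single vector store)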
template <class Float,
class SrcDesc,
class DstDesc,
class SrcOpLengths,
class MapDst2Src,
index_t DstDataPerWrite>
__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v3(SrcDesc,
const Float* __restrict__ p_src,
DstDesc,
Float* __restrict__ p_dst,
SrcOpLengths,
MapDst2Src,
Number<DstDataPerWrite>)
{
using vector_t = typename vector_type<Float, DstDataPerWrite>::MemoryType;
constexpr index_t nDim = SrcOpLengths::GetSize();
static_assert(DstDataPerWrite == 1 || DstDesc{}.GetStride(Number<nDim - 1>{}) == 1,
"wrong! only support dst.stride[nDim-1] == 1, if DstDataPerWrite != 1");
static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
"wrong! only support DstDataPerWrite == 1, 2 or 4");
static_assert(
DstDesc{}.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
"wrong! dst.stride[nDim-2] should be multiple of DstDataPerWrite to keep alignment");
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});
constexpr index_t L_Dst_Back = dst_op_lengths.Back();
static_assert(L_Dst_Back % DstDataPerWrite == 0,
"wrong! dst.lengths[nDim-1] should be evenly divided by DstDataPerWrite");
constexpr index_t nWrite = L_Dst_Back / DstDataPerWrite;
ford<decltype(dst_op_lengths.PopBack())>{}([&](auto ids) {
static_for<0, nWrite, 1>{}([&](auto IWrite) {
vector_t dst_vec_data;
// pack data
static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) {
const auto dst_multi_id =
ids.PushBack(IWrite.Get() * DstDataPerWrite + IDstData.Get());
const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
const index_t src_index = src_desc.Get1dIndex(src_multi_id);
vector_type<Float, DstDataPerWrite>::SetScalar(
dst_vec_data, p_src[src_index], IDstData);
});
// write data
const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite);
const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) = dst_vec_data;
});
});
}