"...composable_kernel-1.git" did not exist on "bd7a2300061e26092edfe28c605c32101f0ec9e8"
Commit a9a392b4 authored by Chao Liu's avatar Chao Liu
Browse files

experimenting TensorCoordinate and new merged tensor copy operator

parent 2eeeb176
...@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
              "GemmDataPerReadB alignment requirement is not satisfied");
#if 0 // debug
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
...@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    InBlockCopySrcDataPerRead_B,
    InBlockCopyDstDataPerWrite_N2>(
    {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
#else
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
Float,
decltype(in_e_n1_b_n2_global_merged_desc),
decltype(in_e_n1_b_n2_block_desc),
MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
NormalTensorCoordinate<decltype(in_e_n1_b_n2_block_desc)>,
decltype(in_e_n1_b_n2_block_desc.GetLengths()),
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
#endif
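Read against the #if 0 branch above, the new copy operator makes the source and destination coordinate types explicit template parameters. A hedged annotation of the arguments as used in this hunk (roles inferred only from the names visible here, not from the class definition):

// auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
//     BlockSize,                      // cooperating threads in the block
//     Float,                          // element type
//     SrcDesc, DstDesc,               // merged global descriptor, normal LDS descriptor
//     SrcCoord, DstCoord,             // MergedTensorCoordinate / NormalTensorCoordinate
//     SliceLengths,                   // block tile: in_e_n1_b_n2_block_desc.GetLengths()
//     SubLengths, ClusterLengths,     // per-thread slice vs. thread-cluster decomposition
//     ThreadClusterArrangeOrder>(     // thread layout over the cluster
//     src_block_slice_origin,         // {0, 0, b_block_data_on_global, 0}
//     dst_block_slice_origin);        // {0, 0, 0, 0}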
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
...@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    Sequence<EPerBlock, KPerBlock>{},
    Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
#if 0 // debug
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already has blockwise offset built-in
...@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    WeiBlockCopySrcDataPerRead_E,
    WeiBlockCopyDstDataPerWrite_K>(
    {0, k_block_data_on_global}, {0, 0});
#else
auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
Float,
decltype(wei_e_k_global_desc),
decltype(wei_e_k_block_desc),
NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
decltype(wei_e_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
#endif
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
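For reference, the element-wise form of the update in the comment above, as a minimal standalone sketch (the real blockwise GEMM distributes this loop nest across the thread block):

// c_mtx += transpose(a_mtx) * b_mtx, with A stored as E x M, B as E x N, C as M x N.
inline void naive_gemm_at_b(const float* A, const float* B, float* C, int E, int M, int N)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
            for(int e = 0; e < E; ++e)
                C[m * N + n] += A[e * M + m] * B[e * N + n];
}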
...@@ -277,8 +305,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
__syncthreads();
#if 0
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
#else
blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
#endif
}
// copy output: register to global memory
...@@ -328,6 +361,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
    k_thread_data_on_global, 0, b_thread_data_on_global, 0);
#if 0 // debug
threadwise_generic_tensor_slice_copy_v1(
    out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
    p_out_thread,
...@@ -338,6 +372,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
    arithmetic_sequence_gen<0, 8, 1>::type{},
    Number<1>{});
#else
ThreadwiseGenericTensorSliceCopy_v2<
Float,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
{0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
.Run(p_out_thread, p_out_thread_on_global);
#endif
}
}
};
...
...@@ -301,40 +301,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
    b_thread_data_on_global % B1});
threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
#elif 0
// This is a hack, because slicing a merged dimension is not supported yet.
// This should be replaced with logic above, once slicing a merged dimension support
// becomes available
// dst descriptor
constexpr auto out_k0_k1_b_global_desc =
make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
Sequence<1>{},
Sequence<2>{},
Sequence<0, 3, 4>{});
// src descriptor
constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNRepeat * GemmNPerThreadSubC>{});
auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
Float,
decltype(out_k0_k1_b_thread_desc),
decltype(out_k0_k1_b_global_desc),
NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>>(
{0, 0, 0},
{k_thread_data_on_global / K1,
k_thread_data_on_global % K1,
b_thread_data_on_global});
for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
{
threadwise_out_copy.Run(p_out_thread, p_out_global);
threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
}
#elif 1
// This is a hack, because slicing a merged dimension is not supported yet.
// This should be replaced with logic above, once slicing a merged dimension support
...
...@@ -16,7 +16,7 @@ struct NormalTensorCoordinate
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
__host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
    : mIndex{tensor_index}, mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
{
}
...@@ -26,38 +26,15 @@ struct NormalTensorCoordinate
{
}
__host__ __device__ constexpr Array<unsigned, nDim> GetIndex() const { return mIndex; }
__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
template <class IDim, bool PositiveDirection>
__host__ __device__ void
MoveOnDimension(IDim idim, index_t step_size, integral_constant<bool, PositiveDirection>)
{
if(PositiveDirection)
{
mIndex(idim) += step_size;
mOffset += step_size * tensor_desc_type::GetStride(idim);
}
else
{
mIndex(idim) -= step_size;
mOffset -= step_size * tensor_desc_type::GetStride(idim);
}
}
// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
});
return *this;
}
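The single GetOffsetFromMultiIndex(step_sizes) call above is equivalent to the removed per-dimension loop because a normal (strided) tensor's offset is linear in the multi-index; a minimal standalone check of that identity (hypothetical 3-D strides, not taken from this repository):

#include <cassert>

int main()
{
    // offset(i) = sum_d i[d] * stride[d], so offset(i + step) == offset(i) + offset(step).
    const int stride[3] = {24, 4, 1}; // hypothetical strides of a normal 3-D tensor
    const int idx[3]    = {1, 2, 3};
    const int step[3]   = {0, 1, 2};

    auto offset = [&](const int* a) {
        return a[0] * stride[0] + a[1] * stride[1] + a[2] * stride[2];
    };

    const int moved[3] = {idx[0] + step[0], idx[1] + step[1], idx[2] + step[2]};
    assert(offset(moved) == offset(idx) + offset(step));
    return 0;
}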
...@@ -65,14 +42,9 @@ struct NormalTensorCoordinate
template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
});
return *this;
}
...@@ -93,19 +65,25 @@ struct NormalTensorCoordinate
return coord;
}
// reposition point of origin, and return compensated offset.
// This is a hack to reduce index calculation while looping over a tensor whose
// origin is this TensorCoordinate. It does so by spitting out the run-time offset
// to the pointer (to the tensor data) held by this TensorCoordinate, so the caller
// can fold the offset into the run-time pointer to the data. That way only one
// run-time variable (the updated pointer) is needed, instead of two (the old
// pointer plus this offset).
// TODO: after introducing the concept of "run-time tensor view", which contains the
// run-time pointer to the data, always keep track of the pointer instead of both
// the offset and the pointer. This also brings the additional benefit that we don't
// need to worry about the offset underflowing (offset is an unsigned integer) when
// updating it.
__host__ __device__ constexpr index_t RepositionOrigin()
{
    index_t offset_diff = mOffset;
mOffset = 0;
mIndex = make_zero_array<index_t, nDim>();
mOffset = 0;
    return offset_diff;
}
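A minimal sketch of the calling pattern the comment above describes, i.e. folding the compensated offset into the data pointer once so the inner loop carries a single run-time variable (function and parameter names here are hypothetical, not from this repository):

// Fold the coordinate's accumulated run-time offset into the data pointer;
// after this call the coordinate's own offset is zero.
template <class TData, class TCoord>
__host__ __device__ const TData* fold_origin_into_pointer(const TData* p_data, TCoord& coord)
{
    return p_data + coord.RepositionOrigin();
}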
// private:
private:
Array<index_t, nDim> mIndex;
index_t mOffset;
};
...@@ -120,8 +98,7 @@ struct MergedTensorCoordinate
tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
__host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
: mIndex{tensor_index},
: mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
{
    // partial offset on each dimension
    static_for<0, nDim, 1>{}([&](auto idim) {
...@@ -146,8 +123,6 @@ struct MergedTensorCoordinate
{
}
__host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }
__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
// step_size should be known at compile time
...@@ -157,17 +132,7 @@ struct MergedTensorCoordinate
{
    constexpr auto idim = IDim{};
    // update multi-index
    // update original index
if(PositiveDirection)
{
mIndex(idim) += step_size;
}
else
{
mIndex(idim) -= step_size;
}
// update rest
static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
    constexpr auto partial_original_dims =
        tensor_desc_type::GetContainedOriginalDimensions(idim);
...@@ -253,19 +218,10 @@ struct MergedTensorCoordinate
    // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
    mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
}).Else([&](auto) {
}).Else([&](auto fwd) {
constexpr auto idim_original =
static_if<PositiveDirection>{}([&](auto) {
tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mOriginalIndex(idim_original) += step_size;
mPartialOffsets(idim) += step_size * fwd(tensor_desc_type{}).GetStride(idim);
mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
}).Else([&](auto fwd) {
}).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
mOriginalIndex(idim_original) -= step_size;
mPartialOffsets(idim) -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
});
});
}
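The "+ before -" ordering in the comment above matters because the offsets are unsigned; a tiny standalone illustration:

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t offset = 5, old_partial_offset = 8, new_partial_offset = 10;

    // "offset - old_partial_offset" would wrap around for an unsigned type,
    // so add the new partial offset first, then subtract the old one.
    uint32_t updated = (offset + new_partial_offset) - old_partial_offset;

    assert(updated == 7);
    return 0;
}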
...@@ -273,10 +229,9 @@ struct MergedTensorCoordinate
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
index_t normal_offset_diff = 0;
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
    this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
...@@ -288,10 +243,7 @@ struct MergedTensorCoordinate
template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
    this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
...@@ -316,33 +268,23 @@ struct MergedTensorCoordinate
return coord;
}
// reposition point of origin, and return compensated offset
__host__ __device__ constexpr index_t RepositionOrigin()
{
    index_t offset_diff = 0;

    static_for<0, nDim, 1>{}([&](auto idim_) {
        constexpr auto idim = decltype(idim_){};

        static_if<!tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
            constexpr auto idim_original =
                tensor_desc_type::GetContainedOriginalDimensions(idim).Front();

            mIndex(idim) = 0;
            mOriginalIndex(idim_original) = 0;
            mOffset -= mPartialOffsets[idim];
            offset_diff += mPartialOffsets[idim];
            mPartialOffsets(idim) = 0;
        });
    });

    return offset_diff;
}
// private:
Array<index_t, nDim> mIndex;

__host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }

private:
// Allocate register memory for all merged dimensions and normal dimensions.
// However, only those merged dimensions whose index will be involved in arithmetic
// after the construction of this TensorCoordinate (e.g. when the user moves a slicing
// window on the merged dimension) will actually use this register memory.
// Let's hope the compiler will optimize away the register memory allocated for normal
// dimensions, and for those merged dimensions that will never be involved in index
// arithmetic after construction of the TensorCoordinate.
// TODO: refactor TensorCoordinate after introducing the concept of "dimensions",
// and simplify the implementation of ConstantMergedTensorDescriptor, so we don't need
// to count on the compiler to optimize away that register memory for us
Array<index_t, nOriginalDim> mOriginalIndex;
Array<index_t, nDim> mPartialOffsets; // mPartialOffsets is needed for unsigned index type
Array<index_t, nDim> mPartialOffsets;
// complete offset
index_t mOffset;
};
...
...@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
    constexpr auto gridwise_conv =
#if 0
#if 1
        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
#else
        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
...@@ -85,6 +85,40 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
// 1x1 filter, 8x8 image
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 128;
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_E_B = Sequence<2, 2>;
using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t InBlockCopyDataPerAccess_B = 1;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#endif
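As a quick consistency check on the new "1x1 filter, 8x8 image" block above: the per-thread sub-lengths times the thread-cluster lengths appear to multiply back to the E x B and E x K block tiles, and each copy cluster covers the full block (a standalone sketch reusing the constants above):

#include <cassert>

int main()
{
    const int BlockSize = 256, BPerBlock = 128, KPerBlock = 128, EPerBlock = 8;

    // InBlockCopySubLengths_E_B<2, 2> x InBlockCopyClusterLengths_E_B<4, 64>
    assert(2 * 4 == EPerBlock && 2 * 64 == BPerBlock && 4 * 64 == BlockSize);

    // WeiBlockCopySubLengths_E_K<4, 1> x WeiBlockCopyClusterLengths_E_K<2, 128>
    assert(4 * 2 == EPerBlock && 1 * 128 == KPerBlock && 2 * 128 == BlockSize);

    return 0;
}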
...@@ -96,43 +130,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                           dim3(GridSize),
                           dim3(BlockSize),
...
...@@ -71,7 +71,7 @@ int main(int argc, char* argv[])
{
    using namespace ck;
#if 0
#if 1
    constexpr index_t N = 64;
    constexpr index_t C = 1536;
    constexpr index_t HI = 8;
...