Commit f2f35201 authored by Chao Liu

add hack for merge transformation

parent fa479ce4
...@@ -64,6 +64,20 @@ struct DynamicPassThrough
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -141,6 +155,20 @@ struct DynamicLeftPad
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -228,6 +256,20 @@ struct DynamicRightPad
idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -324,6 +366,20 @@ struct DynamicEmbed
[&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -489,6 +545,141 @@ struct DynamicMerge
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
// idx_diff_low depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_diff_up is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bounds
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& /* idx_up_old */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_diff_low_const
// also doesn't change during the whole kernel execution. The compiler-generated
// ISA should calculate idx_diff_low_const only once and keep it for the whole
// kernel execution.
// If neither 1) nor 2) is satisfied, then the calculation is done at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
});
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
else if constexpr(Hack == 2)
{
// do borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_borrow ? -1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
else
{
// do carry and borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
}
}
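// Illustrative example (assumed values, not from this commit): with low_lengths_ = {4, 3} and
// low_lengths_scan_ = {3, 1}, an upper step idx_diff_up = {+1} gives idx_diff_low_const = {0, 1}.
// For idx_low_old = {1, 2} the last dimension would reach its length (2 + 1 == 3), so do_carry
// is true and idx_diff_low becomes {+1, -2}, moving the lower index from {1, 2} to {2, 0}.
// Hack == 1 assumes the step only ever increases the merged index (carry check only),
// Hack == 2 assumes it only ever decreases it (borrow check only), and any other value
// performs both checks at a small extra cost per dimension.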
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -551,6 +742,20 @@ struct DynamicUnMerge
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...@@ -602,6 +807,20 @@ struct DynamicFreeze
idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>)
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
...
...@@ -9,30 +9,13 @@ namespace ck {
template <index_t NDimHidden, typename VisibleDimensionIds>
struct DynamicTensorCoordinate;
#if 0 // hack
template <index_t NTransform, index_t NDimVisible>
#else
template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
#endif
struct DynamicTensorCoordinateStep;
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDesc& tensor_desc,
const VisibleIndex& idx_visible);
template <typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_diff_visible);
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ constexpr void move_dynamic_tensor_coordinate(
const TensorDesc& tensor_desc, TensorCoord& coord, const TensorCoordStep& coord_step);
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool
coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc& tensor_desc,
const TensorCoord& coord);
template <typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& tensor_desc,
const TensorCoord& coord);
// Transforms: Tuple<transforms...>
// LowerDimensionIdss : Tuple<Sequence<...>, ...>
// UpperDimensionIdss : Tuple<Sequence<...>, ...>
...@@ -74,10 +57,9 @@ struct DynamicTensorDescriptor
constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension();
constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension();
using VisibleIndex = MultiIndex<ndim_visible_>;
using HiddenIndex = MultiIndex<ndim_hidden_>;
using Coordinate = DynamicTensorCoordinate<ndim_hidden_, VisibleDimensionIds>;
using CoordinateStep = DynamicTensorCoordinateStep<ntransform_, ndim_visible_>;
public:
__host__ __device__ explicit constexpr DynamicTensorDescriptor(const Transforms& transforms,
...@@ -211,7 +193,11 @@ struct DynamicTensorCoordinate
HiddenIndex idx_hidden_;
};
#if 0 // hack
template <index_t NTransform, index_t NDimVisible>
#else
template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
#endif
struct DynamicTensorCoordinateStep
{
// TODO make these private
...@@ -234,6 +220,11 @@ struct DynamicTensorCoordinateStep
const VisibleIndex idx_diff_visible_;
const MultiIndex<NTransform> do_transforms_;
#if 1 // hack
// HACK: control CalculateLowerIndexDiff for DynamicMerge using this hack
static constexpr HackCalculateLowerIndexDiff hack_calculate_lower_index_diff_;
#endif
};
// TODO: How to fix this? It uses a struct instead of lambda because lambda
...@@ -406,7 +397,72 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
});
#if 0 // hack
return DynamicTensorCoordinateStep<ntransform, ndim_visible>{idx_diff_visible, do_transforms};
#else
return DynamicTensorCoordinateStep<ntransform,
ndim_visible,
typename uniform_sequence_gen<ntransform, 0>::type>{
idx_diff_visible, do_transforms};
#endif
}
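// With the hack in place, the plain make_dynamic_tensor_coordinate_step above passes a Sequence
// of all zeros, so every transform takes the fall-through branch of CalculateLowerIndexDiff_hack
// that performs both the carry and the borrow check.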
#if 0 // hack
template <typename TensorDesc, typename VisibleIndex>
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using this hack
template <typename TensorDesc, typename VisibleIndex, typename HackCalculateLowerIndexDiff>
#endif
__host__ __device__ constexpr auto make_dynamic_tensor_coordinate_step_hack(
const TensorDesc&, const VisibleIndex& idx_diff_visible, HackCalculateLowerIndexDiff)
{
static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
"wrong! # of dimension inconsistent");
constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension();
constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();
static_assert(HackCalculateLowerIndexDiff::Size() == ntransform, "wrong!");
// use index_t for boolean type
auto do_transforms = make_zero_multi_index<ntransform>();
auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();
// decide do_transform by checking for non-zero index diff components
MultiIndex<VisibleIndex::Size()> non_zero_diff_pick_visible;
static_for<0, ndim_visible, 1>{}(
[&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); });
set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible);
static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran);
const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up);
MultiIndex<dims_low.Size()> non_zero_diff_pick_low;
// if any of the upper index diff components is non-zero, then
// 1) this transform needs to be done
// 2) all components of the lower index diff are assumed to be non-zero and need to be
// computed
const bool idx_diff_up_has_non_zero = container_reduce(
non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false);
do_transforms(itran) = idx_diff_up_has_non_zero;
static_for<0, dims_low.Size(), 1>{}(
[&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; });
set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
});
return DynamicTensorCoordinateStep<ntransform, ndim_visible, HackCalculateLowerIndexDiff>{
idx_diff_visible, do_transforms};
}
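// Usage sketch (illustrative; a descriptor with five transforms is assumed): the caller supplies
// one hack value per transform, e.g. carry-only (1) for a DynamicMerge that is only ever stepped
// forward and 0 for everything else. The per-transform values are later read back in
// move_dynamic_tensor_coordinate via Number<Hack>{}:
//
// const auto step = make_dynamic_tensor_coordinate_step_hack(
// desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 1>{});
// move_dynamic_tensor_coordinate(desc, coord, step);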
template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
...@@ -453,7 +509,17 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
MultiIndex<dims_low.Size()> idx_diff_low;
// calculate idx_diff_low
#if 0 // hack
tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
#else
// HACK: control CalculateLowerIndexDiff for DynamicMerge using this hack
// TODO remove hack
constexpr index_t Hack =
decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
tran.CalculateLowerIndexDiff_hack(
idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
#endif
// update idx_low
idx_low += idx_diff_low;
...
...@@ -149,5 +149,146 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
ThreadwiseTransfer threadwise_transfer_;
};
// this version does the following things to avoid the scratch memory issue
// 1. Use StaticallyIndexedArray instead of a C array for the thread buffer
// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep a reference to the tensor descriptor
// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct a new tensor coordinate
template <index_t BlockSize,
InMemoryDataOperation DstInMemOp,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
index_t ThreadTransferSrcResetCoordinateAfterRun,
index_t ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseDynamicTensorSliceTransfer_v4_hack
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4_hack(
const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(
src_desc, make_zero_multi_index<nDim>(), dst_desc, make_zero_multi_index<nDim>())
{
static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_id_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_desc,
dst_block_slice_origin + thread_data_id_begin);
}
}
__device__ static constexpr auto CalculateThreadDataBegin()
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
return thread_cluster_id * ThreadSliceLengths{};
}
__device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_desc, p_src);
}
}
__device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunWrite(dst_desc, p_dst);
}
}
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
}
}
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
}
}
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer =
ThreadwiseDynamicTensorSliceTransfer_v3_hack<ThreadSliceLengths,
DstInMemOp,
SrcData,
DstData,
SrcDesc,
DstDesc,
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorDim,
DstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcScalarStrideInVector,
DstScalarStrideInVector,
SrcAddressSpace,
DstAddressSpace,
ThreadTransferSrcResetCoordinateAfterRun,
ThreadTransferDstResetCoordinateAfterRun>;
ThreadwiseTransfer threadwise_transfer_;
};
} // namespace ck
#endif
...@@ -166,28 +166,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// B matrix blockwise copy
auto b_blockwise_copy =
-BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+BlockwiseDynamicTensorSliceTransfer_v4_hack<BlockSize,
InMemoryDataOperation::Set,
Sequence<KPerBlock, NPerBlock>,
BBlockTransferThreadSliceLengths_K_N,
BBlockTransferThreadClusterLengths_K_N,
BBlockTransferThreadClusterArrangeOrder,
Float,
Float,
decltype(b_k_n_global_desc),
decltype(b_k_n_block_desc),
BBlockTransferSrcAccessOrder,
Sequence<0, 1>,
BBlockTransferSrcVectorDim,
1,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N,
AddressSpace::Global,
AddressSpace::Lds,
1,
1,
BThreadTransferSrcResetCoordinateAfterRun,
true>(
b_k_n_global_desc,
make_multi_index(0, n_block_data_on_global),
b_k_n_block_desc,
...
...@@ -646,5 +646,348 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
DstCoord dst_slice_origin_;
};
// this version does the following things to avoid "alloca" in LLVM IR, which would cause
// scratch memory usage and sometimes useless instructions
// 1. It does not keep a reference to the tensor descriptor
// 2. It does not construct a new tensor coordinate for this->Run()
// 3. It does not use a pointer for the VGPR thread buffer
// 4. It calculates the offset for the thread buffer directly, instead of moving the coordinate
template <typename SliceLengths,
InMemoryDataOperation DstInMemOp,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
// RunRead(), will be fused with MoveSrcSliceWindow to
// save addr computation
bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// save addr computation
struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
{
static_assert(SrcAddressSpace == AddressSpace::Global or
SrcAddressSpace == AddressSpace::Lds,
"wrong!");
static_assert(DstAddressSpace == AddressSpace::Global or
DstAddressSpace == AddressSpace::Lds,
"wrong!");
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack()
: ThreadwiseDynamicTensorSliceTransfer_v3_hack(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
__device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
{
static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
// hardcoded for 2D
// TODO implement N-D
if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
{
#if 0 // hack
// TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
#elif 0
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
#elif 1
// for padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, 1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(0, -1),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc,
make_multi_index(-1, 0),
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
#elif 1
// for non-padded input tensor
const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});
const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 1, 0>{});
const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
#endif
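// Note: each hack Sequence above must have one element per transform of src_desc (enforced by
// the static_assert in make_dynamic_tensor_coordinate_step_hack); the 15-element and 7-element
// variants correspond to the padded and non-padded input tensor descriptors mentioned in the
// comments above.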
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
static_for<0, Len0, 1>{}([&](auto iter0) {
static_for<0, Len1, 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0.value;
constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
// do work
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for buffer_load
// TODO refactor transfer_data() to encapsulate this
static_assert(SrcAddressSpace == AddressSpace::Global,
"wrong! hardcoded to use buffer_load, src must be global mem");
buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
p_src,
src_slice_origin_.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_),
src_desc.GetElementSpaceSize());
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
src_desc, src_slice_origin_, src_step_0_m1);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < Len0 - 1)
{
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p1_0);
}
});
}
// move src coordinate back to its slice origin
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_back_step =
make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
}
}
__device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
{
static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
"wrong! hardcoded for 2D tensor");
// hardcoded for 2D
// TODO implement N-D
if constexpr(remove_reference_t<DstDesc>::GetNumOfDimension() == 2)
{
// TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t Len0 = SliceLengths{}[0];
constexpr index_t Len1 = SliceLengths{}[1];
static_for<0, Len0, 1>{}([&](auto iter0) {
static_for<0, Len1, 1>{}([&](auto iter1) {
// step direction
constexpr bool forward_dim1 = (iter0.value % 2 == 0);
constexpr index_t i0 = iter0;
constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;
// do work
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(make_multi_index(i0, i1));
// hardcoding for ds_write
// TODO refactor transfer_data() to encapsulate this
static_assert(DstAddressSpace == AddressSpace::Lds &&
DstInMemOp == InMemoryDataOperation::Set,
"wrong! hardcoded for ds_write");
p_dst[dst_slice_origin_.GetOffset()] = buffer_[Number<buffer_offset>{}];
// move dim1 iterator
if constexpr(iter1.value < Len1 - 1)
{
if constexpr(forward_dim1)
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(
dst_desc, dst_slice_origin_, dst_step_0_m1);
}
}
});
// move dim0 iterator
if constexpr(iter0.value < Len0 - 1)
{
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
}
});
}
// move dst coordinate back to its slice origin
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_back_step =
make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
}
}
__device__ static constexpr auto GetCoordinateBackStep()
{
MultiIndex<nDim> back_step;
back_step(Number<0>{}) = 1 - SliceLengths{}[0];
static_for<1, nDim, 1>{}([&](auto i) {
back_step(i) = (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
});
return back_step;
}
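// Worked example (illustrative): for SliceLengths = {2, 3} the snake-order traversal in
// RunRead()/RunWrite() visits (0,0) (0,1) (0,2) (1,2) (1,1) (1,0) and ends at (1, 0), so the
// back step is {1 - 2, 0} = {-1, 0}; for SliceLengths = {3, 4} the traversal ends at (2, 3)
// and the back step is {-2, -3}.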
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by RunRead(), then need to adjust the step here
const auto adjusted_step_idx = SrcResetCoordinateAfterRun
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
#if 0 // hack
const auto adjusted_step = make_dynamic_tensor_coordinate_step(
src_desc, adjusted_step_idx);
#elif 1
// for padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 1
// for non-padded input tensor
const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
}
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by RunWrite(), then need to adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateBackStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
}
private:
static constexpr auto buffer_desc_ =
make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));
static constexpr index_t buffer_size_ = buffer_desc_.GetElementSpaceSize();
StaticallyIndexedArray<SrcData, buffer_size_> buffer_;
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
} // namespace ck
#endif
...@@ -87,7 +87,40 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 1
+#elif 0
// cdata = 64, BlockSize = 256, 128x128x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 2;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = 1;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<1, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 0
// cdata = 64, BlockSize = 256, 128x128x4
constexpr index_t BlockSize = 256;
...
...@@ -53,7 +53,37 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
-#if 1
+#if 0
// cdata = 64, BlockSize = 256, 128x128x2
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 2;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 8;
constexpr index_t GemmNLevel1Cluster = 8;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<1, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<2, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
// cdata = 64, BlockSize = 256, 128x128x4
constexpr index_t BlockSize = 256;
...@@ -138,7 +168,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
-#if 0 // debug
+#if 1 // debug
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
...@@ -22,7 +22,7 @@ int main(int argc, char* argv[])
{
using namespace ck;
-#if 1
+#if 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
...