Commit 68ea43b1 authored by Chao Liu

add vector load and access order to threadwise copy v1r3

parent 5fdccfce
@@ -492,7 +492,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
0,
GemmABlockTransferSrcScalarPerVector_GemmK,
GemmABlockTransferDstScalarPerVector_GemmM,
- true, // move back src coordinate after threadwise copy
+ false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
Sequence<0, 1>,
......
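The flipped argument above is the "move back src coordinate after threadwise copy" switch of the blockwise A copy. A minimal sketch of the behavior that comment describes, with hypothetical names (the real transfer classes in this repo track full multi-dimensional coordinates; this only illustrates the flag's stated meaning):

// Sketch only; names are hypothetical, not this repo's API.
struct SliceCopySketch
{
    int src_offset = 0;

    void Run(int slice_size, bool move_back_src_coordinate_after_run)
    {
        src_offset += slice_size; // the coordinate advances while copying the slice
        if(move_back_src_coordinate_after_run)
            src_offset -= slice_size; // step back to the slice origin for the next window move
    }
};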
@@ -502,11 +502,11 @@ struct DynamicMerge
typename LowIdx,
typename UpIdx,
index_t Hack>
- __host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
+ __host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
@@ -640,6 +640,148 @@ struct DynamicMerge
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
// can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then
// idx_diff_low_const also doesn't change during the whole kernel execution.
// The compiler-generated ISA should then calculate idx_diff_low_const only
// once and keep it for the whole kernel execution.
// If neither 1) nor 2) is satisfied, the calculation is done at run-time
// each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. The caller must make sure the result is thread-invariant.
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
else if constexpr(Hack == 2)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t negative_idx_low_tmp = borrow - idx_low[i];
bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) -= borrow;
borrow = do_borrow ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
idx_low += idx_diff_low;
}
else
{
// do carry and borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
}
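UpdateLowerIndex_1b first decomposes the upper-index increment into per-dimension constants (the integer divisions discussed in the comment above), then folds them into idx_low with a carry pass (Hack == 1), a borrow pass (Hack == 2), or both. The carry pass is ordinary mixed-radix addition; a small self-contained host sketch with made-up lengths {2, 3, 4} (illustrative only, not the repo's DynamicMerge):

#include <cstdio>

// Propagate a per-dimension increment through a merged index with lengths
// {2, 3, 4}, carrying into the next-higher dimension in reversed order and
// skipping dimension 0, like the Hack == 1 branch above.
int main()
{
    const int len[3]      = {2, 3, 4};
    int idx_low[3]        = {1, 2, 3}; // current lower index
    const int diff_low[3] = {0, 0, 1}; // increment decomposed per dimension

    int carry = 0;
    for(int i = 2; i > 0; --i)
    {
        int tmp       = idx_low[i] + diff_low[i] + carry;
        bool do_carry = tmp >= len[i];
        idx_low[i]    = do_carry ? tmp - len[i] : tmp;
        carry         = do_carry ? 1 : 0;
    }
    idx_low[0] += diff_low[0] + carry;

    printf("%d %d %d\n", idx_low[0], idx_low[1], idx_low[2]); // prints "2 0 0"
}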
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
@@ -705,11 +847,15 @@ struct DynamicMerge
do_carry = idx_low_tmp >= low_lengths_[i];
#if 0
// TODO: use exec-mask inline asm
if(do_carry)
{
idx_diff_low(i) -= low_lengths_[i];
}
#else
idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i];
#endif
idx_low(i) += idx_diff_low[i];
});
@@ -733,11 +879,15 @@ struct DynamicMerge
do_borrow = idx_low_tmp < 0;
#if 0
// TODO: use exec-mask inline asm
if(do_borrow)
{
idx_diff_low(i) += low_lengths_[i];
}
#else
idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i];
#endif
idx_low(i) += idx_diff_low[i];
});
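The two hunks above keep the original if(do_carry) / if(do_borrow) bodies under #if 0 and replace them with a select. The pattern in isolation is shown below; my reading of the TODO about exec-mask inline asm is that a data select keeps the wavefront's control flow uniform, though whether it actually produces better ISA here is an assumption, not something the diff states.

// Branchless form of the conditional update used above: compute the adjusted
// value and pick it with a select instead of mutating under an if.
inline int update_diff(int idx_diff, int low_length, bool do_carry)
{
    // equivalent to: if(do_carry) { idx_diff -= low_length; }
    return do_carry ? idx_diff - low_length : idx_diff;
}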
@@ -765,8 +915,10 @@ struct DynamicMerge
const UpIdx& idx_up_new,
Number<Hack>) const
{
- #if 1
- UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
+ #if 0
+ UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#elif 0
UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#else
UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif
......
@@ -4,6 +4,7 @@
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
#include "ConstantMatrixDescriptor.hpp"
@@ -364,9 +365,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// define input tensor descriptor for threadwise copy
// thread input tensor, src of threadwise copy
#if 0 // debug
constexpr auto c_m0_m1_n0_n1_thread_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(
make_multi_index(MRepeat, MPerThread, NRepeat, NPerThread));
#else
constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>{});
#endif
// calculate origin of thread input tensor on global memory
// blockwise GEMM c matrix starting index
@@ -379,6 +385,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
#if 0
ThreadwiseDynamicTensorSliceTransfer_v1r2<
AccFloat,
Float,
@@ -404,6 +411,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
n_thread_data_on_global % N1))
.Run_hack(
c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#else
ThreadwiseDynamicTensorSliceTransfer_v1r3<
AccFloat,
Float,
decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true,
true>(c_m0_m1_n0_n1_global_desc,
make_multi_index(m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
n_thread_data_on_global / N1,
n_thread_data_on_global % N1))
.Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#endif
}
}
......
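This is the hunk the commit message refers to: the C-matrix write-out now goes through ThreadwiseDynamicTensorSliceTransfer_v1r3, which adds a dimension access order, a vector dimension, and a scalar-per-vector count. The reading below is inferred from the parameter names visible in this instantiation, not from the class's documentation:

// Annotated reading of the v1r3 template arguments above (assumption):
//   AccFloat, Float                                    - src (thread accumulator) and dst (global C) data types
//   decltype(c_m0_m1_n0_n1_thread_desc),
//   decltype(c_m0_m1_n0_n1_global_desc)                - src and dst tensor descriptors
//   Sequence<MRepeat, MPerThread, NRepeat, NPerThread> - slice lengths copied per Run
//   CThreadTransferSrcDstAccessOrder                   - dimension access order (new in v1r3)
//   CThreadTransferSrcDstVectorDim                     - which dimension is vectorized (new in v1r3)
//   CThreadTransferDstScalarPerVector                  - vector width of the global store
//   AddressSpace::Vgpr, AddressSpace::Global           - src and dst address spaces
//   CGlobalMemoryDataOperation                         - plain store vs. atomic add on C
//   1, true, true                                      - left as in the diff; meaning not evident from this hunk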
@@ -97,8 +97,8 @@ __llvm_amdgcn_buffer_atomic_add_f32(float vdata,
#endif
// buffer_load requires:
- // 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
- // 2) p_src_thread to be a wavewise pointer.
+ // 1) p_src_wave must be in global memory space
+ // 2) p_src_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
@@ -118,6 +118,18 @@ __device__ void amd_buffer_store(const T* p_src_thread,
bool dst_thread_data_valid,
index_t dst_data_range);
// buffer_store requires:
// 1) p_dst_wave must be in global memory space
// 2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void
amd_buffer_store_v2(const typename vector_type<T, VectorSize>::MemoryType src_thread_data,
T* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range);
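Unlike amd_buffer_store above, which reads the data through a thread-local pointer, the v2 variant takes the data by value. A hypothetical call site (c_thread_vec, p_c_global, dst_offset, and c_size are illustrative names, not from this commit):

// Each thread stores a 4-wide float vector into the wave's view of global
// memory; threads whose slot is out of range pass false and store nothing.
amd_buffer_store_v2<float, 4>(c_thread_vec,             // vector_type<float, 4>::MemoryType
                              p_c_global,               // wavewise base pointer into global memory
                              dst_offset,               // this thread's element offset
                              dst_offset + 4 <= c_size, // per-thread validity flag
                              c_size);                  // buffer range, in elements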
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
@@ -926,6 +938,126 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
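// My reading of this trick (assumption, not stated in the diff): an invalid
// lane's byte offset is pushed far past the range recorded in range[2], so the
// buffer instruction itself drops the store and no branch on
// dst_thread_data_valid is needed.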
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
......
@@ -91,7 +91,7 @@
#endif
// workaround: put all workaround here
- // workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
+ // workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
@@ -123,6 +123,8 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
// int32x4_t use by buffer_load and buffer_store llvm intrinsic
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
......
@@ -32,7 +32,8 @@ struct static_for
static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
"Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
- "wrongs! should have NBegin <= NEnd");
+ "wrong! should satisfy (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && "
+ "NBegin >= NEnd)");
}
template <class F>
......
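The relaxed assertion matches how static_for is used elsewhere in this commit, including the reversed carry/borrow loops. A minimal usage sketch (loop bodies elided):

// Forward loop: i = 0, 1, 2, 3
static_for<0, 4, 1>{}([&](auto i) { /* ... */ });

// Reversed loop, as in the carry/borrow passes of DynamicMerge: i = 3, 2, 1
// (NBegin > NEnd is exactly what the assertion now permits when Increment < 0)
static_for<3, 0, -1>{}([&](auto i) { /* ... */ });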
@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
- #elif 0
+ #elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2
constexpr index_t BlockSize = 256;
@@ -166,7 +166,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
- constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
+ constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 2;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
- #if 0
+ #if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
......
@@ -22,7 +22,22 @@ int main(int argc, char* argv[])
{
using namespace ck;
- #if 1
+ #if 0
// 3x3, 36x36, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 37;
constexpr index_t WI = 37;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
......
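A quick sanity check of the new problem size, assuming the standard convolution output formula and the v4r4 implicit-GEMM mapping GemmM = K, GemmN = N*Ho*Wo, GemmK = C*Y*X (my reading of this driver, not something stated in the hunk):

// With Y = X = 3, stride 2, dilation 1 and zero padding:
//   Ho = (HI - Y) / stride + 1 = (37 - 3) / 2 + 1 = 18, and likewise Wo = 18
//   GemmM = K       = 384
//   GemmN = N*Ho*Wo = 128 * 18 * 18 = 41472
//   GemmK = C*Y*X   = 192 * 3 * 3   = 1728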