Commit 415a4a5b authored by Chao Liu's avatar Chao Liu
Browse files

updating v5r1

parent 32d485dd
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
namespace ck { namespace ck {
#if 0
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N] // blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visable to the whole block, C is distributed among each thread // A and B are visable to the whole block, C is distributed among each thread
// If following number are power of 2, index calculation shall be greatly reduced: // If following number are power of 2, index calculation shall be greatly reduced:
...@@ -364,12 +365,20 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v1 ...@@ -364,12 +365,20 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v1
#endif #endif
} }
}; };
#endif
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N] // C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visable to the whole block, C is distributed among each thread // A and B are visable to the whole block, C is distributed among each thread
// If following number are power of 2, index calculation shall be greatly reduced: // Assume:
// MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster, // 1. A:
// MLevel1ThreadCluster, NLevel1ThreadCluster // 1. BlockMatrixA is known at compile-time
// 2. ABlockBuffer is DynamicBuffer
// 2. B:
// 1. BlockMatrixA is known at compile-time
// 2. BBlockBuffer is DynamicBuffer
// 3. C:
// 1. ThreadMatrixC is known at compile-time
// 2. CThreadBuffer is StaticBuffer
template <index_t BlockSize, template <index_t BlockSize,
typename FloatA, typename FloatA,
typename FloatB, typename FloatB,
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
namespace ck { namespace ck {
#if 0
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N] // blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visable to the whole block, C is distributed among each thread // A and B are visable to the whole block, C is distributed among each thread
// If following number are power of 2, index calculation shall be greatly reduced: // If following number are power of 2, index calculation shall be greatly reduced:
...@@ -199,6 +200,181 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -199,6 +200,181 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
Run_naive(p_a_block, p_b_thread, p_c_thread); Run_naive(p_a_block, p_b_thread, p_c_thread);
} }
}; };
#else
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visable to the whole block, C is distributed among each thread
// If following number are power of 2, index calculation shall be greatly reduced:
// KPerThread, HPerThread, MLevel0ThreadCluster, NLevel0ThreadCluster,
// MLevel1ThreadCluster, NLevel1ThreadCluster
template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename BlockMatrixA,
typename BlockMatrixB,
typename ThreadMatrixC,
index_t KPerThread,
index_t HPerThread,
index_t WPerThread,
index_t EPerThreadLoop,
index_t ThreadGemmADataPerRead_K,
index_t ThreadGemmBDataPerRead_W>
struct BlockwiseGemm_km_kn_m0m1n0n1_v3
{
struct MatrixIndex
{
index_t k;
index_t h;
index_t w;
};
MatrixIndex c_thread_begin_mtx_idx_;
// HACK: fix this @Jing Zhang
static constexpr index_t KPerThreadSubC = 4;
static constexpr auto a_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));
static constexpr auto b_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
static constexpr auto c_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
using AThreadCopy =
ThreadwiseDynamicTensorSliceTransfer_v4<FloatA,
FloatA,
BlockMatrixA,
decltype(a_thread_mtx_),
Sequence<EPerThreadLoop, KPerThreadSubC>,
Sequence<0, 1>,
1,
ThreadGemmADataPerRead_K,
AddressSpace::Generic,
AddressSpace::Vgpr,
1>;
AThreadCopy a_thread_copy_;
__device__ BlockwiseGemm_km_kn_m0m1n0n1_v3()
: c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())},
a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)}
{
static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
BlockMatrixB::IsKnownAtCompileTime() &&
ThreadMatrixC::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
"wrong! K dimension not consistent\n");
constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t N = BlockMatrixB{}.GetLength(I1);
constexpr index_t H = BlockMatrixB{}.GetLength(I2);
constexpr index_t W = BlockMatrixB{}.GetLength(I3);
static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
"wrong! Cannot evenly divide work among\n");
constexpr auto KThreadCluster = K / KPerThread;
constexpr auto HThreadCluster = H / HPerThread;
constexpr auto WThreadCluster = W / WPerThread;
static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
"wrong! wrong blocksize\n");
}
__device__ static constexpr auto GetThreadMatrixCLengths()
{
return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
}
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
{
constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
constexpr auto num_w_threads = W / WPerThread;
constexpr auto num_h_threads = H / HPerThread;
constexpr auto num_hw_threads = num_w_threads * num_h_threads;
index_t k_thread_id = thread_id / num_hw_threads;
index_t hw_thread_id = thread_id % num_hw_threads;
index_t h_thread_id = hw_thread_id / num_w_threads;
index_t w_thread_id = hw_thread_id % num_w_threads;
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
}
__device__ void Run(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
{
auto a_block_buf = make_dynamic_buffer(p_a_block);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
// HACK: fix this @Jing Zhang
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WoPerThreadSubC == 0, "");
// thread A, B for GEMM
FloatA p_a_thread[a_thread_mtx_.GetElementSpaceSize()];
auto a_thread_buf = make_dynamic_buffer(p_a_thread);
auto b_thread_buf = make_dynamic_buffer(p_b_thread);
auto c_thread_buf = make_dynamic_buffer(p_c_thread);
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<FloatA,
FloatB,
FloatC,
decltype(a_thread_mtx_),
decltype(b_thread_mtx_),
decltype(c_thread_mtx_),
HoPerThreadSubC,
WoPerThreadSubC>{};
static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) {
static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) {
a_thread_copy_.Run(a_block_mtx,
make_tuple(e_begin, k_begin),
a_block_buf,
a_thread_mtx_,
make_tuple(I0, I0),
a_thread_buf);
static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) {
static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) {
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0),
b_thread_buf,
make_tuple(e_begin, I0, h_begin, w_begin),
c_thread_buf,
make_tuple(k_begin, I0, h_begin, w_begin));
});
});
});
});
}
};
#endif
} // namespace ck } // namespace ck
#endif #endif
...@@ -28,7 +28,10 @@ struct ThreadwiseDynamicTensorSliceSet_v1 ...@@ -28,7 +28,10 @@ struct ThreadwiseDynamicTensorSliceSet_v1
static_assert(Desc::IsKnownAtCompileTime(), static_assert(Desc::IsKnownAtCompileTime(),
"wrong! SrcDesc and DstDesc need to known at compile-time"); "wrong! SrcDesc and DstDesc need to known at compile-time");
#if 0
// TODO: turn this on when v5r1 is update to get rid of array
static_assert(Buffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); static_assert(Buffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
#endif
static_assert(is_known_at_compile_time<remove_cv_t<remove_reference_t<OriginIdx>>>::value, static_assert(is_known_at_compile_time<remove_cv_t<remove_reference_t<OriginIdx>>>::value,
"wrong! OriginIdx need to be known at compile-time"); "wrong! OriginIdx need to be known at compile-time");
......
...@@ -1399,7 +1399,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4 ...@@ -1399,7 +1399,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
remove_cv_t<remove_reference_t<DstData>>>::value, remove_cv_t<remove_reference_t<DstData>>>::value,
"wrong! SrcBuffer or DstBuffer data type is wrong"); "wrong! SrcBuffer or DstBuffer data type is wrong");
#if 0
// turn this on after v5r1 is updated
static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
#endif
static_assert(is_known_at_compile_time< static_assert(is_known_at_compile_time<
remove_cv_t<remove_reference_t<SrcRefToOriginDisplacement>>>::value && remove_cv_t<remove_reference_t<SrcRefToOriginDisplacement>>>::value &&
...@@ -1413,8 +1416,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4 ...@@ -1413,8 +1416,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{}; constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};
// SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time
constexpr auto src_ref_to_origin_disp_idx = SrcRefToOriginDisplacement{}; constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
constexpr auto dst_ref_to_origin_disp_idx = DstRefToOriginDisplacement{}; constexpr auto dst_ref_to_origin_disp_idx = to_multi_index(DstRefToOriginDisplacement{});
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -1469,7 +1472,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4 ...@@ -1469,7 +1472,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
// src coordinate // src coordinate
constexpr auto src_ref_to_data_disp_idx = constexpr auto src_ref_to_data_disp_idx =
to_multi_index(src_ref_to_origin_disp_idx + data_to_origin_disp_idx); src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
constexpr auto src_ref_to_data_disp_coord_iterator = constexpr auto src_ref_to_data_disp_coord_iterator =
make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx); make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace ck { namespace ck {
#if 1 #if 0
template <typename Float, typename Desc> template <typename Float, typename Desc>
__device__ void threadwise_matrix_set_zero_v2(Desc, Float* __restrict__ p_thread) __device__ void threadwise_matrix_set_zero_v2(Desc, Float* __restrict__ p_thread)
{ {
...@@ -174,8 +174,7 @@ struct ThreadwiseGemm_km_kn_mn_v1 ...@@ -174,8 +174,7 @@ struct ThreadwiseGemm_km_kn_mn_v1
// Element of matrix can be vectorized data // Element of matrix can be vectorized data
// Assume: // Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time // 1. ADesc, BDesc, CDesc are known at compile-time
// 2. ABuffer, BBuffer, CBuffer are static buffer // 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
// 3. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA, template <typename FloatA,
typename FloatB, typename FloatB,
typename FloatC, typename FloatC,
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
namespace ck { namespace ck {
#if 0
template <typename Float, typename Desc> template <typename Float, typename Desc>
__device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread) __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread)
{ {
...@@ -31,10 +32,15 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread ...@@ -31,10 +32,15 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread
}); });
}); });
} }
#endif
#if 0
// C[M, N] += transpose(A[K, M]) * B[K, N] // C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data // Element of matrix can be vectorized data
template <typename ADesc, template <typename FloatA,
typename FloatB,
typename FloatC,
typename ADesc,
typename BDesc, typename BDesc,
typename CDesc, typename CDesc,
index_t H, index_t H,
...@@ -44,8 +50,7 @@ template <typename ADesc, ...@@ -44,8 +50,7 @@ template <typename ADesc,
bool>::type = false> bool>::type = false>
struct ThreadwiseGemm_km_kn_mn_v3 struct ThreadwiseGemm_km_kn_mn_v3
{ {
template <typename FloatA, typename FloatB, typename FloatC> __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
__device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{ {
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() && static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(), CDesc::IsKnownAtCompileTime(),
...@@ -132,13 +137,153 @@ struct ThreadwiseGemm_km_kn_mn_v3 ...@@ -132,13 +137,153 @@ struct ThreadwiseGemm_km_kn_mn_v3
}); });
}); });
} }
};
template <typename FloatA, typename FloatB, typename FloatC> #else
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) // C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data
// Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ADesc,
typename BDesc,
typename CDesc,
index_t H,
index_t W,
typename std::enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseGemm_km_kn_mn_v3
{
template <typename ABuffer,
typename AOriginIdx,
typename BBuffer,
typename BOriginIdx,
typename CBuffer,
typename COriginIdx>
__device__ static void Run(const ABuffer& a_buf,
AOriginIdx,
const BBuffer& b_buf,
BOriginIdx,
CBuffer& c_buf,
COriginIdx)
{ {
Run_source(p_a, p_b, p_c); static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(
is_known_at_compile_time<remove_cv_t<remove_reference_t<AOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<BOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<COriginIdx>>>::value,
"wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto E = ADesc{}.GetLength(I0);
constexpr auto K = ADesc{}.GetLength(I1);
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, E, 1>{}([&](auto e) {
static_for<0, K, 1>{}([&](auto k) {
constexpr index_t a_offset =
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k));
if constexpr(H == 2 && W == 2)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else if constexpr(H == 4 && W == 1)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else
{
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));
#if 0
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}]);
#else
amd_assembly_inner_product(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{}));
#endif
});
});
}
});
});
} }
}; };
#endif
} // namespace ck } // namespace ck
#endif #endif
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#define CK_DEVICE_BACKEND_AMD 1 #define CK_DEVICE_BACKEND_AMD 1
// GPU ID // GPU ID
#if 1 #if 0
#define CK_AMD_GPU_GFX906 1 #define CK_AMD_GPU_GFX906 1
#elif 0 #elif 0
#define CK_AMD_GPU_GFX908 1 #define CK_AMD_GPU_GFX908 1
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#endif #endif
// launch bounds // launch bounds
#define CK_USE_LAUNCH_BOUNDS 0 #define CK_USE_LAUNCH_BOUNDS 1
#ifdef CK_USE_LAUNCH_BOUNDS #ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256 #define CK_MAX_THREAD_PER_BLOCK 256
......
...@@ -64,7 +64,7 @@ int main(int argc, char* argv[]) ...@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 1
constexpr index_t N = 1; constexpr index_t N = 1;
constexpr index_t C = 16; constexpr index_t C = 16;
constexpr index_t HI = 1080; constexpr index_t HI = 1080;
...@@ -630,7 +630,7 @@ int main(int argc, char* argv[]) ...@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", to_multi_index(ConvStrides{})); print_array("ConvStrides", to_multi_index(ConvStrides{}));
print_array("ConvDilations", to_multi_index(ConvDilations{})); print_array("ConvDilations", to_multi_index(ConvDilations{}));
#if 1 #if 0
using in_data_t = float; using in_data_t = float;
constexpr index_t in_vector_size = 1; constexpr index_t in_vector_size = 1;
using acc_data_t = float; using acc_data_t = float;
...@@ -724,7 +724,7 @@ int main(int argc, char* argv[]) ...@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
LeftPads{}, LeftPads{},
RightPads{}, RightPads{},
nrepeat); nrepeat);
#elif 1 #elif 0
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t, device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
in_vector_size, in_vector_size,
acc_data_t, acc_data_t,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment