Commit 23f633cd authored by Chao Liu

clean up for miopen

parent 3c0646d4
@@ -66,10 +66,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
     constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};

     constexpr auto True = integral_constant<bool, true>{};
@@ -77,10 +74,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
     constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};

     constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0);
     constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);
-    constexpr index_t Hi = in_n_c_h_w_global_desc.GetLength(I2);
-    constexpr index_t Wi = in_n_c_h_w_global_desc.GetLength(I3);

     constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1);
     constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2);
@@ -346,7 +341,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     {
         constexpr index_t K2 = GemmMPerThreadSubC;
         constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;
-        constexpr index_t K0 = K / (K1 * K2);

         // define tensor descriptor for threadwise copy
         // output memory layout descriptor in register
...
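The deletions in this first file remove constexpr locals (I4, I6, I7, Hi, Wi, K0) that the kernel no longer reads; since Number<> and GetLength() only carry compile-time constants, dropping them changes nothing in the generated code and mainly keeps the build free of unused-variable warnings. As a rough illustration of the idiom, here is a minimal sketch with made-up names, not the actual ck definitions:

#include <cstdio>

// Hypothetical stand-ins for ck's Number<> and tensor descriptors: the lengths live in
// the type, so GetLength(Number<I>{}) folds to a compile-time constant.
template <unsigned long long N>
struct Number
{
    static constexpr unsigned long long value = N;
};

template <unsigned long long... Ls>
struct TensorDescSketch
{
    template <unsigned long long I>
    static constexpr unsigned long long GetLength(Number<I>)
    {
        constexpr unsigned long long lengths[] = {Ls...};
        return lengths[I];
    }
};

int main()
{
    using InDesc = TensorDescSketch<128, 64, 28, 28>; // assumed N, C, Hi, Wi
    constexpr auto Hi = InDesc::GetLength(Number<2>{});
    static_assert(Hi == 28, "length is known at compile time");
    std::printf("Hi = %llu\n", Hi);
}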
@@ -54,12 +54,8 @@ __host__ __device__ constexpr auto
 template <class TDesc>
 __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
 {
-    const auto desc = TDesc{};
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
+    printf(
+        "%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
 }

 } // namespace ck
...
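The printer above now queries NRow()/NCol()/RowStride() on the descriptor type directly instead of default-constructing a TDesc and reading them off the instance, and it drops the I0/I1 constants it never used. This works because the descriptor exposes these queries as static members. A minimal sketch of the pattern, with an assumed shape rather than the real ConstantMatrixDescriptor:

#include <cstdio>

// Assumed shape of a constant matrix descriptor: all queries are static, so callers
// such as the printer can use TDesc::NRow() without ever constructing an object.
template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
struct ConstantMatrixDescriptorSketch
{
    static constexpr unsigned NRow() { return NRow_; }
    static constexpr unsigned NCol() { return NCol_; }
    static constexpr unsigned RowStride() { return RowStride_; }
};

template <class TDesc>
void print_desc(const char* s)
{
    std::printf(
        "%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
}

int main() { print_desc<ConstantMatrixDescriptorSketch<8, 8, 8>>("c_thread_mtx"); }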
@@ -49,7 +49,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
     constexpr index_t N = BlockMatrixB::NCol();
-    constexpr index_t K = BlockMatrixA::NRow();

     static_assert(M % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 &&
                       N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
@@ -100,12 +99,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     {
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t MPerThread = c_thread_mtx.NRow();
-        constexpr index_t NPerThread = c_thread_mtx.NCol();
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
         constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
@@ -125,9 +118,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                                const FloatB* __restrict__ p_b_block,
                                FloatC* __restrict__ p_c_thread) const
     {
-        constexpr auto True = integral_constant<bool, true>{};
-        constexpr auto False = integral_constant<bool, false>{};
-
         constexpr auto a_block_mtx = BlockMatrixA{};
         constexpr auto b_block_mtx = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
@@ -146,13 +136,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_thread_mtx =
             make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});

-        // thread A-sub, B-sub for copy
-        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
-
-        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
-
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
@@ -172,9 +155,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         using Float4 = vector_type<float, 4>::MemoryType;

-        Float4* reg_a = (Float4*)(p_a_thread);
-        Float4* reg_b = (Float4*)(p_b_thread);
-        Float4* reg_c = (Float4*)(p_c_thread);
+        Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread);
+        Float4* reg_b = reinterpret_cast<Float4*>(p_b_thread);
+        Float4* reg_c = reinterpret_cast<Float4*>(p_c_thread);

         reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
         reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);
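The three register-pointer casts above switch from C-style casts to reinterpret_cast, matching the style already used for the loads from p_a_block and p_b_block; behaviour is identical, but the named cast makes the reinterpretation explicit and, unlike a C-style cast, refuses to silently cast away const. A hedged sketch of the difference, with Float4 standing in for vector_type<float, 4>::MemoryType:

// Assumed vector type for illustration (clang/HIP ext_vector_type extension).
using Float4 = float __attribute__((ext_vector_type(4)));

__device__ void load_one_float4(float* p_a_thread, const float* p_a_block, unsigned offset)
{
    Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread); // fine: no const involved
    // Float4* bad = reinterpret_cast<Float4*>(p_a_block); // would not compile: casts away const
    // Float4* ugly = (Float4*)p_a_block;                  // a C-style cast would accept this silently
    reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[offset]); // const-correct source cast
}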
@@ -215,8 +198,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();

         constexpr index_t MPerThread = c_thread_mtx.NRow();
@@ -245,8 +226,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
         constexpr index_t NRepeat = NPerThread / NPerThreadSubC;

-        const FloatA* const p_a_block_thread_offset = p_a_block + mMyThreadOffsetA;
-
 #pragma unroll
         // loop over k
         for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
@@ -306,8 +285,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();

         constexpr index_t MPerThread = c_thread_mtx.NRow();
...
@@ -93,8 +93,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
                           "wrong! cannot evenly divide sliced tensor into cluster");
         });

-        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
-
         // for now, only support SubLengths == 1 on a merged dimension that constains
         // multiple original dimensions
         static_for<0, nDim, 1>{}([&](auto IDim_) {
@@ -297,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto IDim = Number<IDim_>{};
             constexpr index_t idim = IDim;

-            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
+            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
                 // logic for a merged dimension, also works for non-merged dimension, but its logic may
                 // be unncessarily complicated for compiler to remove calculations that are useless for
                 // a non-merged dimension
@@ -337,7 +335,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
                 mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
-            }).Else([&](auto fwd) {
+            }).Else([&](auto) {
                 // Logic for non-merged dimension. If you are never going to move the slicing window on
                 // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
                 // which are being calculated here, will never be used later. In this case, compiler
...
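The two static_if branches above drop the name fwd from their generic-lambda parameter because the forwarder is never used inside those branch bodies; an unnamed parameter keeps the call signature intact while avoiding unused-parameter warnings. A rough sketch of the idea, with simplified, assumed semantics rather than ck's static_if implementation:

// Simplified stand-in: invoke the callable only when the compile-time condition holds,
// passing it one argument (the forwarder) that the callable is free to ignore.
template <bool Condition>
struct StaticIfSketch
{
    template <class F>
    static void Then(F f)
    {
        if constexpr(Condition)
        {
            f(0); // 0 stands in for the forwarder object
        }
    }
};

void example()
{
    // The lambda never touches its argument, so it can stay unnamed.
    StaticIfSketch<true>::Then([&](auto) { /* branch body */ });
}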
@@ -71,7 +71,7 @@ __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransC>,
                                 FloatC* __restrict__ p_c_thread)
 {
-    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto fwd) {
+    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) {
         constexpr auto a_mtx = MatrixA{};
         constexpr auto b_mtx = MatrixB{};
         constexpr auto c_mtx = MatrixC{};
...
@@ -11,6 +11,7 @@
 namespace ck {

+// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
 template <class Float,
           class SrcDesc,
           class DstDesc,
...
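The new comment documents a precondition rather than changing behaviour: when DataPerAccess (spelled DataPerAccesss in the added comment) is greater than 1, the copy presumably moves vector-typed chunks, and that is only well-defined if the source and destination addresses are aligned to the vector width. A small, hypothetical illustration of the kind of access involved, assuming 4 floats per access:

// Hypothetical 4-wide access: reading float4_t through a float* is only valid when the
// byte address is a multiple of 16, which is what the caller must guarantee when
// DataPerAccess == 4.
using float4_t = float __attribute__((ext_vector_type(4)));

__device__ float4_t load_vec4(const float* p, unsigned offset)
{
    return *reinterpret_cast<const float4_t*>(p + offset); // requires 16-byte alignment of p + offset
}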
@@ -253,7 +253,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
 }

 template <index_t... Xs, index_t... Ys>
-__host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...> seq_y)
+__host__ __device__ constexpr auto operator-(Sequence<Xs...>, Sequence<Ys...>)
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
...
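Dropping seq_x and seq_y from operator- follows the same pattern as the operator+ overload directly above it: a Sequence carries its values entirely in its template arguments, so the function parameters exist only to drive deduction and never need names. A minimal sketch of why this works, with an assumed shape rather than the real ck::Sequence:

#include <type_traits>

// Stand-in for ck::Sequence: the values are the template arguments.
template <unsigned... Is>
struct SequenceSketch
{
};

template <unsigned... Xs, unsigned... Ys>
constexpr auto operator-(SequenceSketch<Xs...>, SequenceSketch<Ys...>)
{
    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
    return SequenceSketch<(Xs - Ys)...>{};
}

// The operands contribute nothing but their types.
static_assert(std::is_same<decltype(SequenceSketch<4, 2>{} - SequenceSketch<1, 1>{}),
                           SequenceSketch<3, 1>>::value,
              "elementwise difference");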
@@ -102,7 +102,7 @@ __device__ void outerProduct1x4(const float& a,
                                 const vector_type<float, 4>::MemoryType& b,
                                 vector_type<float, 4>::MemoryType& c)
 {
-    outerProduct1x4(&a, (float*)&b, (float*)&c);
+    outerProduct1x4(&a, reinterpret_cast<const float*>(&b), reinterpret_cast<float*>(&c));
 }

 __device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
...
@@ -19,6 +19,10 @@ typedef float float4_t __attribute__((ext_vector_type(4)));

 using index_t = uint32_t;

+__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
+
+__device__ index_t get_block_1d_id() { return blockIdx.x; }
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
...
@@ -23,6 +23,10 @@ using float4_t = float4;

 using index_t = uint32_t;

+__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
+
+__device__ index_t get_block_1d_id() { return blockIdx.x; }
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
...
@@ -18,7 +18,7 @@ struct forwarder
 struct swallow
 {
     template <class... Ts>
-    __host__ __device__ constexpr swallow(Ts&&... ts)
+    __host__ __device__ constexpr swallow(Ts&&...)
     {
     }
 };
...
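swallow's constructor takes an arbitrary pack and deliberately does nothing with it, so naming the pack ts only produced an unused-parameter warning. Its usual job is to give a pack expansion somewhere to expand; a sketch of that kind of usage follows (hypothetical call site, not taken from this diff):

#include <cstdio>

// Stand-in for ck's swallow: accept anything, use nothing.
struct SwallowSketch
{
    template <class... Ts>
    constexpr SwallowSketch(Ts&&...)
    {
    }
};

// Call f once per index; "(f(Is), 0)" evaluates the call and yields an int for
// SwallowSketch to consume, which is what makes the pack expansion legal here.
template <unsigned... Is, class F>
void for_each_index(F f)
{
    SwallowSketch{(f(Is), 0)...};
}

int main()
{
    for_each_index<0, 1, 2>([](unsigned i) { std::printf("index %u\n", i); });
}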
@@ -6,10 +6,6 @@
 namespace ck {

-__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
-
-__device__ index_t get_block_1d_id() { return blockIdx.x; }
-
 template <class X, class Y>
 using is_same = std::is_same<X, Y>;
...
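Taken together with the two additions earlier in this commit (the @@ -19,6 +19,10 @@ and @@ -23,6 +23,10 @@ hunks), this last hunk moves get_thread_local_1d_id and get_block_1d_id out of the shared ck utility header and into the two backend-specific headers shown above (the ones defining float4_t and fused_multiply_accumulate), keeping the shared header free of compiler built-ins; the wrapper bodies themselves are unchanged. Shared kernel code keeps calling the wrappers rather than touching threadIdx/blockIdx directly, for example (hypothetical helper for illustration):

#include <cstdint>

using index_t = uint32_t;

// Backend header provides these thin wrappers over the built-ins.
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }

// Shared code can then stay backend-neutral; block_size is an assumed parameter.
__device__ index_t get_global_1d_id(index_t block_size)
{
    return get_block_1d_id() * block_size + get_thread_local_1d_id();
}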