Commit 68ae0731 authored by Chao Liu

experiment with hip compiler

parent 52ae168b
@@ -11,6 +11,9 @@ cmake
     -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
     -D CMAKE_BUILD_TYPE=Release \
     -D DEVICE_BACKEND="HIP" \
+    -D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only" \
+    -D CMAKE_CXX_FLAGS="-gline-tables-only" \
     -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+    -D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep" \
     -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
     ${MY_PROJECT_SOURCE}
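The two -gline-tables-only additions are the tooling side of the experiment: this clang flag (hipcc forwards C++ flags to clang) emits only line-number debug tables, so a profiler can attribute samples to source lines without the code-size and optimization interference of full -g. As a minimal illustration, a hypothetical probe file of my own, not part of the commit, assuming a working ROCm install:

// saxpy_probe.cpp - hypothetical standalone check, not from this commit.
// Built with -gline-tables-only, the object keeps .debug_line (the
// source-line mapping profilers need) but drops variable and type debug
// info, so the binary stays close to a release build.
#include <hip/hip_runtime.h>

__global__ void saxpy(int n, float a, const float* x, float* y)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
        y[i] = a * x[i] + y[i]; // profiler samples resolve to this line
}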
@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
 #elif 1
-    // 1x1 filter, 14x14 image, C = 256
+    // 1x1 filter, 14x14 image, C = 512
     constexpr unsigned N = 128;
-    constexpr unsigned C = 256;
+    constexpr unsigned C = 512;
     constexpr unsigned HI = 14;
     constexpr unsigned WI = 14;
     constexpr unsigned K = 512;
...
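For scale, a back-of-envelope sketch of my own, not code from the commit: with a 1x1 filter and zero padding, this implicit-GEMM convolution collapses to a single plain GEMM. Under the usual mapping for these layouts, GemmM = K, GemmN = N * Ho * Wo and GemmK = C * Y * X, so raising C from 256 to 512 doubles only the reduction depth:

// Hypothetical helper: derive the GEMM shape implied by the constants above.
#include <cstdio>

int main()
{
    constexpr unsigned N = 128, C = 512, HI = 14, WI = 14, K = 512;
    constexpr unsigned Y = 1, X = 1;    // 1x1 filter
    constexpr unsigned HO = HI - Y + 1; // 14 (no padding, unit stride)
    constexpr unsigned WO = WI - X + 1; // 14

    constexpr unsigned long GemmM = K;
    constexpr unsigned long GemmN = 1ul * N * HO * WO;
    constexpr unsigned long GemmK = 1ul * C * Y * X;

    std::printf("GEMM %lu x %lu x %lu, %.1f GFLOP\n",
                GemmM, GemmN, GemmK, 2.0 * GemmM * GemmN * GemmK / 1e9);
    return 0;
}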
@@ -435,12 +435,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 #pragma unroll
         for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
         {
             // read first batch of A, B
             // copy A-sub to form A
-//#pragma unroll
+#pragma unroll
             for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
             {
-#if 0
                 threadwise_matrix_copy(
                     a_block_mtx,
                     p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
@@ -448,7 +447,137 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     a_thread_mtx,
                     p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
                     a_thread_sub_mtx.GetLengths());
-#else
+            }
+
+            // copy B-sub to form B
+#pragma unroll
+            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+            {
+                threadwise_matrix_copy(
+                    b_block_mtx,
+                    p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                        mMyThreadOffsetB,
+                    b_thread_mtx,
+                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                    b_thread_sub_mtx.GetLengths());
+            }
+
+            // loop over batch
+#pragma unroll
+            for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
+            {
+                // do current batch of gemm
+                threadwise_gemm(a_thread_mtx,
+                                True,
+                                p_a_thread,
+                                b_thread_mtx,
+                                False,
+                                p_b_thread,
+                                c_thread_mtx,
+                                False,
+                                p_c_thread + ib * ThreadMatrixStrideC,
+                                f_accum);
+
+                // read next batch of a, b
+                if(BlockMatrixStrideA != 0)
+                {
+#pragma unroll
+                    for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
+                    {
+                        threadwise_matrix_copy(
+                            a_block_mtx,
+                            p_a_block +
+                                a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                                (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA,
+                            a_thread_mtx,
+                            p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                            a_thread_sub_mtx.GetLengths());
+                    }
+                }
+
+                if(BlockMatrixStrideB != 0)
+                {
+#pragma unroll
+                    for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+                    {
+                        threadwise_matrix_copy(
+                            b_block_mtx,
+                            p_b_block +
+                                b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                                (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB,
+                            b_thread_mtx,
+                            p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                            b_thread_sub_mtx.GetLengths());
+                    }
+                }
+            }
+
+            // do last batch of gemm
+            threadwise_gemm(a_thread_mtx,
+                            True,
+                            p_a_thread,
+                            b_thread_mtx,
+                            False,
+                            p_b_thread,
+                            c_thread_mtx,
+                            False,
+                            p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
+                            f_accum);
+        }
+    }
+
+    // this version puts copy and compute in the same place, to experiment with compiler behaviour
+    template <class FloatA, class FloatB, class FloatC, class Accumulator>
+    __device__ void Run_v2(const FloatA* __restrict__ p_a_block,
+                           const FloatB* __restrict__ p_b_block,
+                           FloatC* __restrict__ p_c_thread,
+                           Accumulator f_accum) const
+    {
+        constexpr auto True  = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
+        constexpr auto a_block_mtx  = BlockMatrixA{};
+        constexpr auto b_block_mtx  = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
+
+        constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
+
+        constexpr unsigned MPerThread = c_thread_mtx.NRow();
+        constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+        // thread A, B for GEMM
+        // A is transposed, B is not
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
+
+        // thread A-sub, B-sub for copy
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
+        FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
+
+        constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
+        constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
+
+        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
+        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
+
+        // loop over k
+//#pragma unroll
+        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
+        {
+            // read first batch of A, B
+            // copy A-sub to form A
+//#pragma unroll
+            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
+            {
                 for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
                 {
                     for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
@@ -459,22 +588,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                             mMyThreadOffsetA];
                     }
                 }
-#endif
             }

             // copy B-sub to form B
 //#pragma unroll
             for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
-#if 0
-                threadwise_matrix_copy(
-                    b_block_mtx,
-                    p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
-                        mMyThreadOffsetB,
-                    b_thread_mtx,
-                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
-                    b_thread_sub_mtx.GetLengths());
-#else
                 for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
                 {
                     for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
@@ -485,26 +604,13 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                             mMyThreadOffsetB];
                     }
                 }
-#endif
             }

             // loop over batch
 //#pragma unroll
             for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
             {
                 // do current batch of gemm
-#if 0
-                threadwise_gemm(a_thread_mtx,
-                                True,
-                                p_a_thread,
-                                b_thread_mtx,
-                                False,
-                                p_b_thread,
-                                c_thread_mtx,
-                                False,
-                                p_c_thread + ib * ThreadMatrixStrideC,
-                                f_accum);
-#else
                 for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
                 {
                     for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
@@ -521,7 +627,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     }
                 }
             }
-#endif

             // read next batch of a, b
             if(BlockMatrixStrideA != 0)
@@ -529,16 +634,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 //#pragma unroll
                 for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
                 {
-#if 0
-                    threadwise_matrix_copy(
-                        a_block_mtx,
-                        p_a_block +
-                            a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
-                            (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA,
-                        a_thread_mtx,
-                        p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
-                        a_thread_sub_mtx.GetLengths());
-#else
                     for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
                     {
                         for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
@@ -550,7 +645,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                             (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA];
                         }
                     }
-#endif
                 }
             }
@@ -559,16 +653,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 //#pragma unroll
                 for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
                 {
-#if 0
-                    threadwise_matrix_copy(
-                        b_block_mtx,
-                        p_b_block +
-                            b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
-                            (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB,
-                        b_thread_mtx,
-                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
-                        b_thread_sub_mtx.GetLengths());
-#else
                     for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
                     {
                         for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
@@ -580,24 +664,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                             (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB];
                         }
                     }
-#endif
                 }
             }

             // do last batch of gemm
-#if 0
-            threadwise_gemm(a_thread_mtx,
-                            True,
-                            p_a_thread,
-                            b_thread_mtx,
-                            False,
-                            p_b_thread,
-                            c_thread_mtx,
-                            False,
-                            p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
-                            f_accum);
-#else
             for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
             {
                 for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
@@ -613,7 +684,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     }
                 }
             }
-#endif
         }
     }
...
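Both Run and the new Run_v2 share the same software pipeline over the per-thread batch dimension: the GEMM for batch ib runs out of registers while the copies for batch ib + 1 are issued, and the final batch is peeled off after the loop so nothing reads past the end. Run_v2 only hand-inlines threadwise_matrix_copy and threadwise_gemm into raw loops, to see whether hipcc schedules the interleaved copy/compute version differently. Reduced to its skeleton, a sketch under those assumptions and not the repo's code:

#include <cstdio>

// compute(ib): multiply-accumulate on sub-tiles already held in registers.
// prefetch(ib): copy the next batch's A/B sub-tiles toward registers.
template <class Compute, class Prefetch>
void pipelined_batches(unsigned batch_per_thread, Compute compute, Prefetch prefetch)
{
    for(unsigned ib = 0; ib + 1 < batch_per_thread; ++ib)
    {
        compute(ib);      // consume batch ib
        prefetch(ib + 1); // overlap the fetch of batch ib + 1
    }
    compute(batch_per_thread - 1); // peeled last batch: nothing left to fetch
}

int main()
{
    pipelined_batches(3,
                      [](unsigned ib) { std::printf("gemm on batch %u\n", ib); },
                      [](unsigned ib) { std::printf("prefetch batch %u\n", ib); });
    return 0;
}

Whether the overlap actually materializes depends on how the compiler schedules the unrolled loops, which is presumably what the line-table profiling build configured above is meant to reveal.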
@@ -209,10 +209,15 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
         {
             for(unsigned x = 0; x < X; ++x)
             {
-                blockwise_batch_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
-                                         p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
-                                         p_out_thread,
-                                         [](auto& acc, const auto&& v) { acc += v; });
+#if 1
+                blockwise_batch_gemm.Run
+#elif 0
+                blockwise_batch_gemm.Run_v2
+#endif
+                (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                 p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
+                 p_out_thread,
+                 [](auto& acc, const auto&& v) { acc += v; });
             }
         }
     }
...
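The call site selects between the two implementations with a preprocessor toggle: as committed, #if 1 compiles in the original Run, and flipping it to #if 0 / #elif 1 dispatches to Run_v2 with an identical argument list, including the accumulator lambda. Schematically, that lambda is the reduction step of the C += A^T * B triple loop that threadwise_gemm performs and Run_v2 inlines by hand. A hypothetical stand-in, not the repo's implementation:

// Hypothetical stand-in for threadwise_gemm: A is stored transposed (K x M),
// and the update C[i][j] (+)= A[k][i] * B[k][j] is delegated to f_accum so
// callers can swap in a different accumulation rule.
#include <cstdio>

template <unsigned K, unsigned M, unsigned N, class FAccum>
void threadwise_gemm_sketch(const float (&a)[K][M],
                            const float (&b)[K][N],
                            float (&c)[M][N],
                            FAccum f_accum)
{
    for(unsigned k = 0; k < K; ++k)
        for(unsigned i = 0; i < M; ++i)
            for(unsigned j = 0; j < N; ++j)
                f_accum(c[i][j], a[k][i] * b[k][j]);
}

int main()
{
    float a[2][2] = {{1, 2}, {3, 4}}; // A^T has rows {1,3} and {2,4}
    float b[2][2] = {{5, 6}, {7, 8}};
    float c[2][2] = {};
    threadwise_gemm_sketch(a, b, c, [](auto& acc, const auto& v) { acc += v; });
    std::printf("%g %g\n%g %g\n", c[0][0], c[0][1], c[1][0], c[1][1]); // 26 30 / 38 44
    return 0;
}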