Unverified commit 01055d95 authored by Chao Liu, committed by GitHub

No raw index calculation (#31)



* Replace most raw index calculation with coordinate transformation (see the sketch below)
* Overhaul blockwise and threadwise GEMM
* Overhaul driver for gridwise GEMM kernel

Co-authored-by: Jing Zhang <jizhan@amd.com>
parent d075adf1
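To make the first bullet concrete, here is a minimal, self-contained sketch in plain C++ (not ck's actual descriptor types or API) of the difference between repeating raw index arithmetic at every call site and deriving offsets through a single coordinate-transform object. The names in it (offset_nhwc_raw, PackedDescriptor4D) are illustrative only and do not exist in this repository.

    // Raw index calculation vs. a reusable coordinate transformation for an
    // NHWC tensor. Both map a multi-index to the same linear offset; the
    // descriptor form keeps the stride logic in one place.
    #include <array>
    #include <cstdint>
    #include <iostream>

    // Raw index calculation: every call site repeats the stride arithmetic.
    std::int64_t offset_nhwc_raw(std::int64_t n, std::int64_t h, std::int64_t w, std::int64_t c,
                                 std::int64_t H, std::int64_t W, std::int64_t C)
    {
        return ((n * H + h) * W + w) * C + c;
    }

    // Coordinate transformation: strides are derived once from the lengths,
    // and the multi-index -> offset mapping lives in one place.
    struct PackedDescriptor4D
    {
        std::array<std::int64_t, 4> lengths;
        std::array<std::int64_t, 4> strides;

        explicit PackedDescriptor4D(std::array<std::int64_t, 4> lens) : lengths(lens)
        {
            strides[3] = 1;
            for(int i = 2; i >= 0; --i)
                strides[i] = strides[i + 1] * lengths[i + 1];
        }

        std::int64_t CalculateOffset(std::array<std::int64_t, 4> idx) const
        {
            std::int64_t off = 0;
            for(int i = 0; i < 4; ++i)
                off += idx[i] * strides[i];
            return off;
        }
    };

    int main()
    {
        const std::int64_t N = 1, H = 4, W = 4, C = 8;
        PackedDescriptor4D in_desc({{N, H, W, C}});

        // Both mappings agree (93 for this index); the descriptor form is what a
        // kernel can move incrementally (e.g. a sliding window) without redoing
        // the full arithmetic at every access.
        std::cout << offset_nhwc_raw(0, 2, 3, 5, H, W, C) << " == "
                  << in_desc.CalculateOffset({{0, 2, 3, 5}}) << std::endl;
    }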
@@ -29,13 +29,17 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
 {
     using namespace ck;
 
-    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk"
-              << std::endl;
+    std::cout << __func__ << std::endl;
 
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
 
     constexpr auto N = OutDesc::GetLengths()[I0];
     constexpr auto K = OutDesc::GetLengths()[I1];
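One detail worth noting in the hunk above: I4 through I8 are added because the transform introduced further down returns a bundle of nine descriptors, and each element of that bundle has its own static type, so it must be indexed with a compile-time constant. A rough stand-alone analogy using std::tuple follows; Number<> here is just std::integral_constant, not ck's actual implementation.

    #include <tuple>
    #include <type_traits>
    #include <iostream>

    // Stand-in for ck::Number<N>: an integral constant usable as a tag argument.
    template <int N>
    using Number = std::integral_constant<int, N>;

    // The element index must be a compile-time constant because each element of
    // the tuple can have a different type.
    template <typename Tuple, int N>
    auto get_element(const Tuple& t, Number<N>)
    {
        return std::get<N>(t);
    }

    int main()
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        // Imagine these are two differently-typed tensor descriptors.
        auto descs = std::make_tuple(42, 3.14);

        std::cout << get_element(descs, I0) << ", " << get_element(descs, I1) << std::endl;
    }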
@@ -53,7 +57,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     constexpr auto C0 = C / Number<InWeiVectorSize>{};
     constexpr auto C1 = Number<InWeiVectorSize>{};
 
-#if 0
+#if 1
     // run-time variables
     constexpr auto in_n_hi_wi_c0_desc =
         make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C0));
@@ -112,7 +116,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
     out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
 
-#if 1
+#if 0
     // cdata = 16, BlockSize = 64, 16x64x4
     constexpr index_t BlockSize = 64;
@@ -372,51 +376,92 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
     constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmM1 = 4;
 #endif
 
-    constexpr auto conv_driver =
-#if 1
-        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
-#elif 0
-        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_no_pad
-#elif 1
-        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
-#endif
-        <BlockSize,
-         typename vector_type<TInWei, InWeiVectorSize>::type,
-         TAcc,
-         TOut,
-         GemmMPerBlock,
-         GemmNPerBlock,
-         GemmKPerBlock,
-         GemmMPerThread,
-         GemmNPerThread,
-         GemmKPerThread,
-         GemmMLevel0Cluster,
-         GemmNLevel0Cluster,
-         GemmMLevel1Cluster,
-         GemmNLevel1Cluster,
-         GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
-         GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
-         GemmABlockTransferSrcScalarPerVector_GemmK,
-         GemmABlockTransferDstScalarPerVector_GemmM,
-         GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
-         GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
-         GemmBBlockTransferSrcScalarPerVector_GemmK,
-         GemmBBlockTransferDstScalarPerVector_GemmN,
-         GemmCThreadTransferDstScalarPerVector_GemmM1>{};
-
-    conv_driver.Run(wei_k_y_x_c0_desc,
-                    in_n_hi_wi_c0_desc,
-                    out_n_ho_wo_k_desc,
-                    conv_strides,
-                    conv_dilations,
-                    in_left_pads,
-                    in_right_pads,
-                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
-                        wei_k_y_x_c_device_buf.GetDeviceBuffer()),
-                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
-                        in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
-                    static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()));
+    constexpr index_t GemmM1 = GemmMPerThread * GemmMLevel0Cluster * GemmMLevel1Cluster;
+    constexpr index_t GemmN1 = GemmNPerThread * GemmNLevel0Cluster * GemmNLevel1Cluster;
+
+    const auto descs =
+#if 1
+        transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_pad
+#else
+        transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_1x1
+#endif
+        <GemmMPerBlock, GemmNPerBlock, GemmM1, GemmN1>(wei_k_y_x_c0_desc,
+                                                       in_n_hi_wi_c0_desc,
+                                                       out_n_ho_wo_k_desc,
+                                                       conv_strides,
+                                                       conv_dilations,
+                                                       in_left_pads,
+                                                       in_right_pads);
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = launch_kernel_dynamic_gemm_v1<
+            BlockSize,
+            typename vector_type<TInWei, InWeiVectorSize>::type,
+            TAcc,
+            TOut,
+            InMemoryDataOperation::Set,
+            decltype(descs[I0]),
+            decltype(descs[I1]),
+            decltype(descs[I2]),
+            decltype(descs[I3]),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerThread,
+            GemmNPerThread,
+            GemmKPerThread,
+            GemmMLevel0Cluster,
+            GemmNLevel0Cluster,
+            GemmMLevel1Cluster,
+            GemmNLevel1Cluster,
+            GemmABlockTransferThreadSliceLengths_GemmK_GemmM,
+            GemmABlockTransferThreadClusterLengths_GemmK_GemmM,
+            Sequence<1, 0>,
+            Sequence<1, 0>,
+            0,
+            GemmABlockTransferSrcScalarPerVector_GemmK,
+            GemmABlockTransferDstScalarPerVector_GemmM,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
+            GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
+            Sequence<1, 0>,
+            Sequence<1, 0>,
+            0,
+            GemmBBlockTransferSrcScalarPerVector_GemmK,
+            GemmBBlockTransferDstScalarPerVector_GemmN,
+            false, // don't move back src coordinate after threadwise copy, which will be fused with
+                   // MoveSrcSliceWindow() to save addr computation
+            Sequence<2, 3, 0, 1>,
+            1,
+            GemmCThreadTransferDstScalarPerVector_GemmM1,
+            decltype(descs[I4]),
+            decltype(descs[I5]),
+            decltype(descs[I6]),
+            decltype(descs[I7]),
+            decltype(descs[I8])>(static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                                     wei_k_y_x_c_device_buf.GetDeviceBuffer()),
+                                 static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                                     in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
+                                 static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
+                                 descs[I0],
+                                 descs[I1],
+                                 descs[I2],
+                                 descs[I3],
+                                 descs[I4],
+                                 descs[I5],
+                                 descs[I6],
+                                 descs[I7],
+                                 descs[I8],
+                                 nrepeat);
+
+        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
+                     (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
+    }
+
+    // copy result back to host
     out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
 
     auto f_nhwk2nkhw = [&](auto n, auto k, auto ho, auto wo) {
......
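For reference, the perf value printed inside the new timing loop uses the standard operation count for a forward convolution: one multiply and one add per (n, k, ho, wo, c, y, x) accumulation point, i.e. 2 * N * K * Ho * Wo * C * Y * X flops, and dividing that by 1e9 and by the averaged kernel time in milliseconds yields TFlop/s directly. A small stand-alone check of that arithmetic is sketched below; the shape values are assumed for illustration and are not taken from a measured run.

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // Hypothetical problem shape, loosely modeled on one of the configs in main.cpp.
        const std::size_t N = 1, K = 16, C = 16, Y = 3, X = 3, Ho = 540, Wo = 960;

        // 1 multiply + 1 add per accumulation point.
        const std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;

        const float ave_time_ms = 0.5f; // hypothetical measured kernel time

        // flop / 1e9 / ms == flop / (1e12 * s) == TFlop/s, matching the driver's formula.
        const float tflops = float(flop) / (std::size_t(1000) * 1000 * 1000) / ave_time_ms;

        std::printf("%zu flop, %.3f TFlop/s at %.3f ms\n", flop, tflops, ave_time_ms);
    }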
@@ -48,8 +48,8 @@ int main(int argc, char* argv[])
     using ConvStrides = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
 
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 #elif 0
     constexpr index_t N = 1;
     constexpr index_t C = 16;
@@ -62,9 +62,9 @@ int main(int argc, char* argv[])
     using ConvStrides = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
 
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     constexpr index_t N = 1;
     constexpr index_t C = 16;
     constexpr index_t HI = 1080;
@@ -92,7 +92,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<1, 1>;
     using RightPads = Sequence<1, 1>;
 
-#elif 0
+#elif 1
     constexpr index_t N = 1;
     constexpr index_t C = 16;
     constexpr index_t HI = 540;
@@ -210,7 +210,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<1, 1>;
     using RightPads = Sequence<1, 1>;
 
-#elif 0
+#elif 1
     // 3x3, 71x71
     constexpr index_t N = 128;
     constexpr index_t C = 192;
@@ -225,7 +225,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<1, 1>;
     using RightPads = Sequence<1, 1>;
 
-#elif 1
+#elif 0
     // 7x1, 17x17
     constexpr index_t N = 128;
     constexpr index_t C = 128;
......
WORKSPACE=$1
echo "workspace: " $WORKSPACE
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v $WORKSPACE:/root/workspace \
asroy/tensorflow:rocm3.7-tf2.3-dev-omp \
/bin/bash
#--network host \
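The script above simply drops the caller into the asroy/tensorflow:rocm3.7-tf2.3-dev-omp container with the given directory mounted at /root/workspace (the --network host flag is left commented out). Assuming it is committed as a helper script in the repo, it would be invoked with the host checkout as its single argument, e.g. bash <path-to-script> /path/to/composable_kernel; the exact script path is not shown in this diff.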