Unverified Commit 3835318c authored by zjing14's avatar zjing14 Committed by GitHub
Browse files

xdlops_v4r4_fwd fp32/fp16 (#34)



* create files for xdlops

* working on blockwise_gemm_xdlops

* add KReduction

* add m/n repeats

* add 2x2 pipeline

* added 128x128 wavegemm

* use StaticBuffer of vector_type

* break vector type to blk_size

* add kpack into xldops_gemm and blockwise_gemm

* abroadcast only

* add fp32 mfma instructions

* adding fp16 mfma

* pack half4_t

* rename kperwave to kpack

* add 32x32x8fp16

* add fp16 mfma

* clean code

* clean code

* V4r4 xdlops kpack (#35)

* add kpack with incorrect results

* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2

* add 1x1 kernel

* add gridwise_gemm_v2 - single_buffer

* enabled dwordx4 for fp16
Co-authored-by: default avatarChao Liu <chao.liu2@amd.com>

* refactor fwd-v4r4-xdlops

* add v4r4-nhwc-xdlop

* improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop

* tweak scheduling in gridwise gemm

* add v4r3 with a single output copy

* init commit: output with slice win

* adding sliceWin

* add multiple repeats pattern

* starting adding bwd-v4r1-xdlops

* use tuple as SrcBuffer

* adding bwd-data v4r1 nhwc xdlops

* fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2()

* fix bug in host bwd-data conv

* initial implementation of bwd-data v4r1 nhwc xdlops

* add launch bound flags

* enable launch bound

* add m/nrepeat=4

* tweak bwd-data v4r1 nhwc xdlops

* added bwd-data v4r1 nhwc xlops with output A and weight B

* add fwd-v4r4 nhwc xdlops, A input, B weight, C output
Co-authored-by: default avatarChao Liu <chao.liu2@amd.com>
parent 1685048a
...@@ -101,6 +101,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1 ...@@ -101,6 +101,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1
static constexpr auto I2 = Number<2>{}; static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{}; static constexpr auto I3 = Number<3>{};
// GM0 and GN0 need to known at compile-time
static constexpr auto GM0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I0); static constexpr auto GM0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I0);
static constexpr auto GN0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I2); static constexpr auto GN0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I2);
...@@ -140,7 +141,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1 ...@@ -140,7 +141,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1
{ {
static_assert(is_known_at_compile_time<remove_cv_t<decltype(GM0)>>::value && static_assert(is_known_at_compile_time<remove_cv_t<decltype(GM0)>>::value &&
is_known_at_compile_time<remove_cv_t<decltype(GN0)>>::value, is_known_at_compile_time<remove_cv_t<decltype(GN0)>>::value,
"wrong!"); "wrong! GM0 and GN0 need to be known at compile-time");
const auto GM1 = a_gk_gm0_gm1_grid_desc.GetLength(I2); const auto GM1 = a_gk_gm0_gm1_grid_desc.GetLength(I2);
const auto GN1 = b_gk_gn0_gn1_grid_desc.GetLength(I2); const auto GN1 = b_gk_gn0_gn1_grid_desc.GetLength(I2);
......
...@@ -101,9 +101,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3 ...@@ -101,9 +101,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer"); static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>, // static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
remove_cv_t<remove_reference_t<SrcData>>>::value, // remove_cv_t<remove_reference_t<SrcData>>>::value,
"wrong! SrcBuffer data type is wrong"); //"wrong! SrcBuffer data type is wrong");
// SrcDesc and src_slice_origin_idx are known at compile-time // SrcDesc and src_slice_origin_idx are known at compile-time
constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{}; constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
...@@ -1407,7 +1407,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4 ...@@ -1407,7 +1407,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
constexpr auto data_to_origin_disp_idx = constexpr auto data_to_origin_disp_idx =
ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access; ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
#endif #endif
// src coordinate // src coordinate
constexpr auto src_ref_to_data_disp_idx = constexpr auto src_ref_to_data_disp_idx =
src_ref_to_origin_disp_idx + data_to_origin_disp_idx; src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
......
This diff is collapsed.
...@@ -268,6 +268,7 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, ...@@ -268,6 +268,7 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
#if 0
vector_type<half_t, 8> tmp; vector_type<half_t, 8> tmp;
tmp.AsType<half4_t>()(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4( tmp.AsType<half4_t>()(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4(
...@@ -280,6 +281,12 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, ...@@ -280,6 +281,12 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
0); 0);
return tmp.AsType<half8_t>()(Number<0>{}); return tmp.AsType<half8_t>()(Number<0>{});
#else
float4_t tmp = __llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<half8_t>(tmp);
#endif
} }
} }
else if constexpr(is_same<T, int32_t>::value) else if constexpr(is_same<T, int32_t>::value)
......
This diff is collapsed.
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#define CK_AMD_GPU_GFX906 1 #define CK_AMD_GPU_GFX906 1
#elif 1 #elif 1
#define CK_AMD_GPU_GFX908 1 #define CK_AMD_GPU_GFX908 1
#elif 1 #elif 0
#define CK_AMD_GPU_GFX1030 1 #define CK_AMD_GPU_GFX1030 1
#endif #endif
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#endif #endif
// launch bounds // launch bounds
#define CK_USE_LAUNCH_BOUNDS 0 #define CK_USE_LAUNCH_BOUNDS 1
#ifdef CK_USE_LAUNCH_BOUNDS #ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256 #define CK_MAX_THREAD_PER_BLOCK 256
...@@ -116,7 +116,7 @@ ...@@ -116,7 +116,7 @@
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1
// merge transformation use magic number division // merge transformation use magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
// hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack: have underlying assumption that need to be satsified, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
......
...@@ -174,8 +174,15 @@ __host__ __device__ constexpr auto container_reduce(const Container& x, ...@@ -174,8 +174,15 @@ __host__ __device__ constexpr auto container_reduce(const Container& x,
{ {
static_assert((IEnd - IBegin) % IStep == 0, "wrong!"); static_assert((IEnd - IBegin) % IStep == 0, "wrong!");
if constexpr(IEnd > IBegin)
{
return container_reduce_impl( return container_reduce_impl(
x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{}); x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
}
else
{
return init;
}
} }
#endif #endif
......
This diff is collapsed.
...@@ -153,6 +153,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X ...@@ -153,6 +153,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
return *this; return *this;
} }
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
}; };
template <typename... Xs> template <typename... Xs>
......
...@@ -19,7 +19,22 @@ int main(int argc, char* argv[]) ...@@ -19,7 +19,22 @@ int main(int argc, char* argv[])
{ {
using namespace launcher; using namespace launcher;
#if 0 #if 1
// 1x1 filter, 14x14 image
constexpr index_t N = 1;
constexpr index_t C = 256;
constexpr index_t HI = 1;
constexpr index_t WI = 128;
constexpr index_t K = 16;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
constexpr index_t N = 64; constexpr index_t N = 64;
constexpr index_t C = 256; constexpr index_t C = 256;
constexpr index_t HI = 56; constexpr index_t HI = 56;
...@@ -93,7 +108,7 @@ int main(int argc, char* argv[]) ...@@ -93,7 +108,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 1
// 1x1 filter, 14x14 image // 1x1 filter, 14x14 image
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 512; constexpr index_t C = 512;
...@@ -153,7 +168,7 @@ int main(int argc, char* argv[]) ...@@ -153,7 +168,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<2, 2>; using LeftPads = Sequence<2, 2>;
using RightPads = Sequence<2, 2>; using RightPads = Sequence<2, 2>;
#elif 1 #elif 0
// 1x7 filter, 0x3 pad, 17x17 input // 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 128; constexpr index_t C = 128;
...@@ -245,7 +260,7 @@ int main(int argc, char* argv[]) ...@@ -245,7 +260,7 @@ int main(int argc, char* argv[])
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0 #elif 0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 1 #elif 0
device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
#elif 1 #elif 1
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment