Ported xdlops kernels to debug the bwdwrw fp32/fp16/bfp16 issue. Verified that at least fwd data fp32 works.

32850b93 · Wen-Heng (Jack) Chung · 583755a7 · 32850b93 · 32850b93 · 32850b93
Commit 32850b93 authored Oct 09, 2019 by Wen-Heng (Jack) Chung
17 changed files
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
 #ifndef CK_COMMON_HEADER_HPP
 #define CK_COMMON_HEADER_HPP
-#define MIOPEN_USE_FP16 1
+#define MIOPEN_USE_FP16 0
 #define MIOPEN_USE_BFP16 0
-#define MIOPEN_USE_FP32 0
+#define MIOPEN_USE_FP32 1
 #define __HIP_PLATFORM_HCC__ 1

--- a/composable_kernel/include/utility/config.hpp.bkup
+++ b/composable_kernel/include/utility/config.hpp.bkup
+#ifndef CK_CONFIG_AMD_HPP
+#define CK_CONFIG_AMD_HPP
+#if 0
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+#endif 
+#include "bfloat16_dev.hpp"
+#define CK_DEVICE_BACKEND_AMD 1
+#define CK_USE_AMD_INLINE_ASM 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#ifndef CK_USE_INLINE_ASM_XDLOPS
+#define CK_USE_INLINE_ASM_XDLOPS 0
+#endif
+namespace ck {
+// float
+// For some reason, the HIP compiler needs these definitions to generate optimal load and store
+// instructions
+typedef float float32_t __attribute__((ext_vector_type(32)));
+typedef float float2_t __attribute__((ext_vector_type(2)));
+typedef float float4_t __attribute__((ext_vector_type(4)));
+typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
+typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
+typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
+// half
+typedef half2 half2_t;
+// index_t: used for index calculation
+using index_t = uint32_t;
+// data type conversion
+template <class T>
+struct type_convert
+{
+    template <class X>
+    __device__ T operator()(X x) const
+    {
+        return static_cast<T>(x);
+    }
+};
+template <>
+template <>
+__device__ float type_convert<float>::operator()<ushort>(ushort x) const
+{
+    return bfloat16_to_float(x);
+}
+template <>
+template <>
+__device__ ushort type_convert<ushort>::operator()<float>(float x) const
+{
+    return float_to_bfloat16(x);
+}
+} // namespace ck
+#endif
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
--- a/composable_kernel/include/utility/functional2.hpp
+++ b/composable_kernel/include/utility/functional2.hpp
--- a/composable_kernel/include/utility/functional3.hpp
+++ b/composable_kernel/include/utility/functional3.hpp
--- a/composable_kernel/include/utility/integral_constant.hpp
+++ b/composable_kernel/include/utility/integral_constant.hpp
--- a/composable_kernel/include/utility/math.hpp
+++ b/composable_kernel/include/utility/math.hpp
--- a/composable_kernel/include/utility/vector_type.hpp
+++ b/composable_kernel/include/utility/vector_type.hpp
--- a/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.cpp
--- a/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.cpp
--- a/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.cpp
--- a/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp
+++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp
--- a/driver/include/conv_common.hpp
+++ b/driver/include/conv_common.hpp
--- a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
--- a/driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp