Refactor for MIOpen integration (#4)

Refactor so that multi-index transformation and padding support can be brought into MIOpen

Refactor for MIOpen integration (#4)
Refactor so that multi-index transformation and padding support can be brought into MIOpen
52c3fe05 · Chao Liu · GitHub · 9aaeacc8 · 52c3fe05 · 52c3fe05
Unverified Commit 52c3fe05 authored Oct 11, 2019 by Chao Liu Committed by GitHub Oct 11, 2019
16 changed files
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -5,14 +5,12 @@
 #include "utility.hpp"
 #include "integral_constant.hpp"
 #include "number.hpp"
+#include "float_type.hpp"
 #include "type.hpp"
 #include "tuple.hpp"
 #include "math.hpp"
-#include "vector_type.hpp"
 #include "sequence.hpp"
-#include "sequence_helper.hpp"
 #include "array.hpp"
-#include "array_helper.hpp"
 #include "functional.hpp"
 #include "functional2.hpp"
 #include "functional3.hpp"
@@ -22,8 +20,8 @@
 #include "amd_inline_asm.hpp"
 #endif

-#if CK_USE_AMD_INTRINSIC
-#include "amd_intrinsic.hpp"
+#if CK_USE_AMD_BUFFER_ADDRESSING
+#include "amd_buffer_addressing.hpp"
 #endif

 #endif
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -3,23 +3,58 @@

 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#include "bfloat16_dev.hpp"

+// index type: unsigned or signed
 #define CK_UNSIGNED_INDEX_TYPE 0
+
+// device backend
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 1
+
+// AMD inline asm
+#ifndef CK_USE_AMD_INLINE_ASM
 #define CK_USE_AMD_INLINE_ASM 1
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#endif
+
+#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
+#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
+#endif
+
+// AMD buffer addressing
+#ifndef CK_USE_AMD_BUFFER_ADDRESSING
+#define CK_USE_AMD_BUFFER_ADDRESSING 1
+#endif
+
+#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
+#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
+#endif
+
+// AMD XDLOPS
+#ifndef CK_USE_AMD_XDLOPS
+#define CK_USE_AMD_XDLOPS 1
+#endif
+
+#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
+#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
+#endif
+
+// experimental implementation
+#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
+#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
+#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

+// workaround
+#define CK_WORKAROUND_SWDEV_202749 1
+
 namespace ck {

-enum address_space_t
+enum AddressSpace
 {
-    generic = 0,
-    global  = 3
+    generic,
+    global
 };

 #if CK_UNSIGNED_INDEX_TYPE
@@ -28,24 +63,8 @@ using index_t = uint32_t;
 using index_t = int32_t;
 #endif

-// For some reason, HIP compiler need this definition to generate optimal load and store
-// instruction
-typedef float float2_t __attribute__((ext_vector_type(2)));
-typedef float float4_t __attribute__((ext_vector_type(4)));
-
+// int32x4_t is used by the buffer_load and buffer_store LLVM intrinsics
 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

-// data type conversion
-template <typename T>
-struct type_convert
-{
-    template <typename X>
-    __device__ T operator()(const X& x) const
-    {
-        return static_cast<T>(x);
-    }
-};
-
 } // namespace ck
-
 #endif
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -6,21 +6,33 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"

+// index type: unsigned or signed
 #define CK_UNSIGNED_INDEX_TYPE 0
+
+// device backend
 #define CK_DEVICE_BACKEND_NVIDIA 1
-#define CK_USE_AMD_INTRINSIC 0
+
+// disable AMD inline asm and intrinsic
 #define CK_USE_AMD_INLINE_ASM 0
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
+#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0
+#define CK_USE_AMD_BUFFER_ADDRESSING 0
+#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0
+#define CK_USE_AMD_XDLOPS 0
+#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
+
+// experimental implementation
+#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0
+#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
+#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

 namespace ck {

-enum address_space_t
+enum AddressSpace
 {
-    generic = 0,
+    generic,
    global = generic
 };

@@ -30,24 +42,5 @@ using index_t = uint32_t;
 using index_t = int32_t;
 #endif

-// For some reason, CUDA need this definition, otherwise
-//   compiler won't generate optimal load and store instruction, and
-//   kernel would produce wrong result, indicating the compiler fail to generate correct
-//   instruction,
-using float2_t = float2;
-using float4_t = float4;
-
-// data type conversion
-template <typename T>
-struct type_convert
-{
-    template <typename X>
-    __device__ T operator()(const X& x) const
-    {
-        return static_cast<T>(x);
-    }
-};
-
 } // namespace ck
-
 #endif
--- a/composable_kernel/include/utility/float_type.amd.hpp.in
+++ b/composable_kernel/include/utility/float_type.amd.hpp.in
--- a/composable_kernel/include/utility/vector_type.hpp
+++ b/composable_kernel/include/utility/vector_type.hpp
--- a/composable_kernel/include/utility/array_helper.hpp
+++ b/composable_kernel/include/utility/array_helper.hpp
-#ifndef CK_ARRAY_HELPER_HPP
-#define CK_ARRAY_HELPER_HPP
+#ifndef CK_PRINT_ARRAY_HPP
+#define CK_PRINT_ARRAY_HPP

 #include "array.hpp"


--- a/composable_kernel/include/utility/sequence_helper.hpp
+++ b/composable_kernel/include/utility/sequence_helper.hpp
-#ifndef CK_SEQUENCE_HELPER_HPP
-#define CK_SEQUENCE_HELPER_HPP
+#ifndef CK_PRINT_SEQUENCE_HPP
+#define CK_PRINT_SEQUENCE_HPP

 #include "sequence.hpp"


--- a/driver/include/conv_common.hpp
+++ b/driver/include/conv_common.hpp
 #ifndef CONV_COMMON_HPP
 #define CONV_COMMON_HPP

-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"

 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>

--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
--- a/driver/include/host_conv.hpp
+++ b/driver/include/host_conv.hpp
 #pragma once
 #include "tensor.hpp"
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"

 // this is ugly, only for 4d
 template <class TConstTensorDesc>

--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
--- a/external/include/bfloat16_dev.hpp
+++ b/external/include/bfloat16_dev.hpp
--- a/script/compile-hip.sh
+++ b/script/compile-hip.sh
 #!/bin/bash
+ export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
 export KMDUMPISA=1
 export KMDUMPLLVM=1
-#export KMOPTLLC="-mattr=+enable-ds128"
- export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
+ export KMDUMPDIR=$PWD

-make -j driver
-/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm
+ make -j driver
+#/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm