Refactor for MIOpen integration (#4)

Refactor, so can bring multi-index transformation and padding support into MIOpen

Refactor for MIOpen integration (#4)
Refactor, so can bring multi-index transformation and padding support into MIOpen
52c3fe05 · Chao Liu · GitHub · 9aaeacc8 · 52c3fe05 · 52c3fe05
Unverified Commit 52c3fe05 authored Oct 11, 2019 by Chao Liu Committed by GitHub Oct 11, 2019
16 changed files
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -5,14 +5,12 @@
 #include "utility.hpp"
 #include "integral_constant.hpp"
 #include "number.hpp"
+#include "float_type.hpp"
 #include "type.hpp"
 #include "tuple.hpp"
 #include "math.hpp"
-#include "vector_type.hpp"
 #include "sequence.hpp"
-#include "sequence_helper.hpp"
 #include "array.hpp"
-#include "array_helper.hpp"
 #include "functional.hpp"
 #include "functional2.hpp"
 #include "functional3.hpp"
@@ -22,8 +20,8 @@
 #include "amd_inline_asm.hpp"
 #endif
-#if CK_USE_AMD_INTRINSIC
+#if CK_USE_AMD_BUFFER_ADDRESSING
-#include "amd_intrinsic.hpp"
+#include "amd_buffer_addressing.hpp"
 #endif
 #endif
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -3,23 +3,58 @@
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#include "bfloat16_dev.hpp"
+// index type: unsigned or signed
 #define CK_UNSIGNED_INDEX_TYPE 0
+// device backend
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 1
+// AMD inline asm
+#ifndef CK_USE_AMD_INLINE_ASM
 #define CK_USE_AMD_INLINE_ASM 1
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
+#endif
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
+#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
+#endif
+// AMD buffer addressing
+#ifndef CK_USE_AMD_BUFFER_ADDRESSING
+#define CK_USE_AMD_BUFFER_ADDRESSING 1
+#endif
+#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
+#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
+#endif
+// AMD XDLOPS
+#ifndef CK_USE_AMD_XDLOPS
+#define CK_USE_AMD_XDLOPS 1
+#endif
+#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
+#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
+#endif
+// experimental implementation
+#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
+#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
+#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+// workaround
+#define CK_WORKAROUND_SWDEV_202749 1
 namespace ck {
-enum address_space_t
+enum AddressSpace
 {
-    generic = 0,
+    generic,
-    global  = 3
+    global
 };
 #if CK_UNSIGNED_INDEX_TYPE
@@ -28,24 +63,8 @@ using index_t = uint32_t;
 using index_t = int32_t;
 #endif
-// For some reason, HIP compiler need this definition to generate optimal load and store
+// int32x4_t use by buffer_load and buffer_store llvm intrinsic
-// instruction
-typedef float float2_t __attribute__((ext_vector_type(2)));
-typedef float float4_t __attribute__((ext_vector_type(4)));
 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
-// data type conversion
-template <typename T>
-struct type_convert
-{
-    template <typename X>
-    __device__ T operator()(const X& x) const
-    {
-        return static_cast<T>(x);
-    }
-};
 } // namespace ck
 #endif
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -6,21 +6,33 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+// index type: unsigned or signed
 #define CK_UNSIGNED_INDEX_TYPE 0
+// device backend
 #define CK_DEVICE_BACKEND_NVIDIA 1
-#define CK_USE_AMD_INTRINSIC 0
+// disable AMD inline asm and intrinsic
 #define CK_USE_AMD_INLINE_ASM 0
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
+#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0
+#define CK_USE_AMD_BUFFER_ADDRESSING 0
+#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0
+#define CK_USE_AMD_XDLOPS 0
+#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
+// experimental implementation
+#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0
+#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
+#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 namespace ck {
-enum address_space_t
+enum AddressSpace
 {
-    generic = 0,
+    generic,
    global = generic
 };
@@ -30,24 +42,5 @@ using index_t = uint32_t;
 using index_t = int32_t;
 #endif
-// For some reason, CUDA need this definition, otherwise
-//   compiler won't generate optimal load and store instruction, and
-//   kernel would produce wrong result, indicating the compiler fail to generate correct
-//   instruction,
-using float2_t = float2;
-using float4_t = float4;
-// data type conversion
-template <typename T>
-struct type_convert
-{
-    template <typename X>
-    __device__ T operator()(const X& x) const
-    {
-        return static_cast<T>(x);
-    }
-};
 } // namespace ck
 #endif
--- a/composable_kernel/include/utility/float_type.amd.hpp.in
+++ b/composable_kernel/include/utility/float_type.amd.hpp.in
--- a/composable_kernel/include/utility/vector_type.hpp
+++ b/composable_kernel/include/utility/vector_type.hpp
--- a/composable_kernel/include/utility/array_helper.hpp
+++ b/composable_kernel/include/utility/array_helper.hpp
-#ifndef CK_ARRAY_HELPER_HPP
+#ifndef CK_PRINT_ARRAY_HPP
-#define CK_ARRAY_HELPER_HPP
+#define CK_PRINT_ARRAY_HPP
 #include "array.hpp"

--- a/composable_kernel/include/utility/sequence_helper.hpp
+++ b/composable_kernel/include/utility/sequence_helper.hpp
-#ifndef CK_SEQUENCE_HELPER_HPP
+#ifndef CK_PRINT_SEQUENCE_HPP
-#define CK_SEQUENCE_HELPER_HPP
+#define CK_PRINT_SEQUENCE_HPP
 #include "sequence.hpp"

--- a/driver/include/conv_common.hpp
+++ b/driver/include/conv_common.hpp
 #ifndef CONV_COMMON_HPP
 #define CONV_COMMON_HPP
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>

--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
--- a/driver/include/host_conv.hpp
+++ b/driver/include/host_conv.hpp
 #pragma once
 #include "tensor.hpp"
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 // this is ugly, only for 4d
 template <class TConstTensorDesc>

--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
--- a/external/include/bfloat16_dev.hpp
+++ b/external/include/bfloat16_dev.hpp
--- a/script/compile-hip.sh
+++ b/script/compile-hip.sh
 #!/bin/bash
+ export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
 export KMDUMPISA=1
 export KMDUMPLLVM=1
-#export KMOPTLLC="-mattr=+enable-ds128"
+ export KMDUMPDIR=$PWD
- export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
-make -j driver
+ make -j driver
-/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm
+#/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm