"vscode:/vscode.git/clone" did not exist on "1eaad6d1bb6e963f26b4179b32b90f6dba2106ad"
Unverified commit 52c3fe05 authored by Chao Liu, committed by GitHub

Refactor for MIOpen integration (#4)

Refactor so that multi-index transformation and padding support can be brought into MIOpen
parent 9aaeacc8
......@@ -5,14 +5,12 @@
#include "utility.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
#include "math.hpp"
#include "vector_type.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "array.hpp"
#include "array_helper.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
......@@ -22,8 +20,8 @@
#include "amd_inline_asm.hpp"
#endif
#if CK_USE_AMD_INTRINSIC
#include "amd_intrinsic.hpp"
#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp"
#endif
#endif
......@@ -3,23 +3,58 @@
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "bfloat16_dev.hpp"
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0
// device backend
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC 1
// AMD inline asm
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
// AMD buffer addressing
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif
#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
#endif
// AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 1
#endif
#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
#endif
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
// workaround
#define CK_WORKAROUND_SWDEV_202749 1
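// Note (illustrative, not part of the original header): the flags above that are
// wrapped in #ifndef (e.g. CK_USE_AMD_INLINE_ASM, CK_USE_AMD_BUFFER_ADDRESSING,
// CK_USE_AMD_XDLOPS) can be overridden from the compiler command line, since the
// default is only applied when the macro is not already defined. A hedged sketch
// of such an invocation (the exact build command is project-specific):
//   hipcc -DCK_USE_AMD_XDLOPS=0 -DCK_USE_AMD_BUFFER_ADDRESSING=0 <other flags> ...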
namespace ck {
enum address_space_t
enum AddressSpace
{
generic = 0,
global = 3
generic,
global
};
#if CK_UNSIGNED_INDEX_TYPE
......@@ -28,24 +63,8 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif
// For some reason, the HIP compiler needs these definitions to generate optimal load and store
// instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
// int32x4_t is used by the buffer_load and buffer_store LLVM intrinsics
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
};
} // namespace ck
#endif
......@@ -6,22 +6,34 @@
#include "nvToolsExt.h"
#include "helper_cuda.h"
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0
// device backend
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INTRINSIC 0
// disable AMD inline asm and intrinsic
#define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_BUFFER_ADDRESSING 0
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0
#define CK_USE_AMD_XDLOPS 0
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
namespace ck {
enum address_space_t
enum AddressSpace
{
generic = 0,
global = generic
generic,
global = generic
};
#if CK_UNSIGNED_INDEX_TYPE
......@@ -30,24 +42,5 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif
// For some reason, CUDA needs these definitions; otherwise the compiler won't generate
// optimal load and store instructions, and the kernel would produce wrong results,
// indicating that the compiler failed to generate correct instructions
using float2_t = float2;
using float4_t = float4;
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
};
} // namespace ck
#endif
#ifndef CK_FLOAT_TYPE_AMD_HPP
#define CK_FLOAT_TYPE_AMD_HPP
namespace ck {
// For some reason, the HIP compiler needs these definitions to generate optimal ISA
// float
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float32_t __attribute__((ext_vector_type(32)));
// float16
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
// bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
template <class T, index_t N>
struct vector_type
{
typedef struct
{
T scalar[N];
} MemoryType;
};
template <>
struct vector_type<float, 1>
{
using MemoryType = float;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<float, 2>
{
using MemoryType = float2_t;
union DataType
{
MemoryType vector;
float scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<float, 4>
{
using MemoryType = float4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 1>
{
using MemoryType = half;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 2>
{
using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
using MemoryType = half4_t;
union DataType
{
MemoryType vector;
half scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
template <>
struct vector_type<ushort, 1>
{
using MemoryType = ushort;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
};
template <>
struct vector_type<ushort, 2>
{
using MemoryType = ushort2_t;
union DataType
{
MemoryType vector;
ushort scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<ushort, 4>
{
using MemoryType = ushort4_t;
union DataType
{
MemoryType vector;
ushort scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
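// Illustrative usage of the vector_type specializations above (a minimal sketch;
// the variable names are assumptions, not part of this header):
//   float2_t v = vector_type<float, 2>::Pack(1.0f, 2.0f);
//   vector_type<float, 2>::SetScalar(v, 3.0f, Number<0>{}); // v becomes {3.0f, 2.0f}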
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(X x) const
{
return static_cast<T>(x);
}
};
template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
return bfloat16_to_float(x);
}
template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
return float_to_bfloat16(x);
}
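// Example of the specializations above (a sketch; values shown only for illustration):
// bfloat16 data is carried in ushort storage, so these conversions route through the
// helpers from bfloat16_dev.hpp:
//   ushort b = type_convert<ushort>{}(1.0f); // float_to_bfloat16 -> 0x3f80
//   float  f = type_convert<float>{}(b);     // bfloat16_to_float -> 1.0f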
template <typename T>
struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(half4_t a, half4_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(ushort2_t a, ushort2_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
__device__ T operator()(ushort4_t a, ushort4_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
};
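// Illustrative usage (a sketch; the variable names are assumptions): accumulate a
// packed half2 dot product in a wider type such as float:
//   half2_t a = vector_type<half, 2>::Pack(static_cast<half>(1.0f), static_cast<half>(2.0f));
//   half2_t b = vector_type<half, 2>::Pack(static_cast<half>(3.0f), static_cast<half>(4.0f));
//   float dot = inner_product_with_conversion<float>{}(a, b); // 1*3 + 2*4 = 11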
} // namespace ck
#endif
#ifndef CK_VECTOR_TYPE_HPP
#define CK_VECTOR_TYPE_HPP
#ifndef CK_FLOAT_TYPE_NVIDIA_HPP
#define CK_FLOAT_TYPE_NVIDIA_HPP
#include "config.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
namespace ck {
// For some reason, CUDA needs these definitions; otherwise the compiler won't generate
// optimal load and store instructions, and the kernel would produce wrong results,
// indicating that the compiler failed to generate correct instructions
// float
using float2_t = float2;
using float4_t = float4;
// float16
using half2_t = half2;
template <class T, index_t N>
struct vector_type
{
typedef struct
{
T scalar[N];
} MemoryType;
};
template <>
......@@ -29,7 +43,7 @@ struct vector_type<float, 2>
{
using MemoryType = float2_t;
union Data
union DataType
{
MemoryType vector;
float scalar[2];
......@@ -44,7 +58,7 @@ struct vector_type<float, 2>
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
Data data;
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
......@@ -56,6 +70,8 @@ struct vector_type<float, 4>
{
using MemoryType = float4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
......@@ -65,23 +81,77 @@ struct vector_type<float, 4>
};
template <>
struct vector_type<const float, 1>
struct vector_type<half, 1>
{
using MemoryType = const float;
using MemoryType = half;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
};
template <>
struct vector_type<const float, 2>
struct vector_type<half, 2>
{
using MemoryType = const float2_t;
using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<const float, 4>
// data type conversion
template <typename T>
struct type_convert
{
using MemoryType = const float4_t;
template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
};
} // namespace ck
template <typename T>
struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
};
} // namespace ck
#endif
#ifndef CK_ARRAY_HELPER_HPP
#define CK_ARRAY_HELPER_HPP
#ifndef CK_PRINT_ARRAY_HPP
#define CK_PRINT_ARRAY_HPP
#include "array.hpp"
......
#ifndef CK_SEQUENCE_HELPER_HPP
#define CK_SEQUENCE_HELPER_HPP
#ifndef CK_PRINT_SEQUENCE_HPP
#define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp"
......
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
......
......@@ -3,14 +3,17 @@
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "convolution_common.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations>
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
......@@ -19,6 +22,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat)
{
using namespace ck;
......@@ -28,9 +33,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr auto in_nchw_desc =
make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
constexpr auto wei_kcyx_desc =
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1);
......@@ -47,7 +55,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, blockwise-GEMM 128x128, each thread holds 64 data
// BlockSize = 256, each thread holds 64 data
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
......@@ -84,7 +92,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0
// BlockSize = 64, blockwise-GEMM 64x64, each thread holds 64 data
// BlockSize = 64, each thread holds 64 data
constexpr index_t BlockSize = 64;
constexpr index_t BPerBlock = 8;
......@@ -120,7 +128,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
#elif 0
// BlockSize = 256, blockwise-GEMM 64x128, each thread holds 32 data
constexpr index_t BlockSize = 256;
......@@ -174,11 +182,15 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
ConvolutionDirection::Forward,
BPerBlock,
KPerBlock,
EPerBlock,
......
......@@ -3,27 +3,23 @@
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"
template <typename T,
typename InDesc,
typename WeiDesc,
typename OutDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat)
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations>
void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
ck::index_t nrepeat)
{
using namespace ck;
......@@ -32,12 +28,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc =
make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
constexpr auto wei_kcyx_desc =
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1);
......@@ -54,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1
// BlockSize = 256, each thread holds 64 data
// BlockSize = 256, blockwise-GEMM 128x128, each thread holds 64 data
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
......@@ -91,7 +84,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0
// BlockSize = 64, each thread holds 64 data
// BlockSize = 64, blockwise-GEMM 64x64, each thread holds 64 data
constexpr index_t BlockSize = 64;
constexpr index_t BPerBlock = 8;
......@@ -127,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0
#elif 1
// BlockSize = 256, blockwise-GEMM 64x128, each thread holds 32 data
constexpr index_t BlockSize = 256;
......@@ -177,48 +170,44 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
#else
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
BPerBlock,
KPerBlock,
EPerBlock,
GemmNRepeat,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated<
GridSize,
BlockSize,
T,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
ConvolutionDirection::Forward,
BPerBlock,
KPerBlock,
EPerBlock,
GemmNRepeat,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
for(index_t i = 0; i < nrepeat; ++i)
{
......
......@@ -5,14 +5,14 @@
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations>
class ConvDilations,
class LeftPads,
class RightPads>
void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
......@@ -21,8 +21,12 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
......@@ -164,7 +168,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
#else
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
#endif
......@@ -176,6 +180,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
BPerBlock,
KPerBlock,
EPerBlock,
......
......@@ -3,30 +3,26 @@
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp"
using namespace ck;
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class LeftPads,
class RightPads>
void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat)
class ConvDilations>
void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
ck::index_t nrepeat)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
......@@ -168,9 +164,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
#else
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated
#endif
<GridSize,
BlockSize,
......@@ -180,8 +176,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
BPerBlock,
KPerBlock,
EPerBlock,
......
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
// this is ugly, only for 4d
template <class TConstTensorDesc>
......
......@@ -4,7 +4,9 @@
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "device.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
......@@ -14,12 +16,12 @@
//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
struct GeneratorTensor_1
{
......@@ -438,7 +440,17 @@ int main(int argc, char* argv[])
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 1
#elif 0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......@@ -447,19 +459,9 @@ int main(int argc, char* argv[])
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
......@@ -481,6 +483,16 @@ int main(int argc, char* argv[])
ConvDilations{},
nrepeat);
#elif 0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......@@ -489,19 +501,9 @@ int main(int argc, char* argv[])
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif
if(do_verification)
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef BFLOAT16_DEVICE_HPP
#define BFLOAT16_DEVICE_HPP
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __HIP_PLATFORM_HCC__
#define EXECUTION_SPECIFIER __device__
#else
#define EXECUTION_SPECIFIER
#endif // __HIP_PLATFORM_HCC__
typedef union
{
uint u32;
ushort2 ushortx2;
// Composable kernels are written in the HIP language. The language doesn't support
// ushort2.hi or ushort2.low.
#ifdef __HIP_PLATFORM_HCC__
ushort ushortvec[2];
#endif // __HIP_PLATFORM_HCC__
float f32;
} cvt_bf16_fp32_t;
EXECUTION_SPECIFIER float bfloat16_to_float(ushort src_val)
{
cvt_bf16_fp32_t target_val;
#ifdef __HIP_PLATFORM_HCC__
target_val.ushortx2 = make_ushort2(0, src_val);
#else
target_val.ushortx2 = (ushort2)(0, src_val);
#endif
return target_val.f32;
}
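// Example (illustration only): a bfloat16 value occupies the upper 16 bits of the
// corresponding float, so bfloat16_to_float(0x3f80) rebuilds the bit pattern
// 0x3f800000, i.e. 1.0f.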
EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val)
{
cvt_bf16_fp32_t target_val;
target_val.f32 = src_val;
// BF16 round and NaN preservation code matches
// https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h
if((~target_val.u32 & 0x7f800000) == 0) // Inf or NaN
{
// When all of the exponent bits are 1, the value is Inf or NaN.
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
// bit being 1. Signaling NaN is indicated by the most significant
// mantissa bit being 0 but some other bit(s) being 1. If any of the
// lower 16 bits of the mantissa are 1, we set the least significant bit
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
// the bfloat16's mantissa bits are all 0.
if((target_val.u32 & 0xffff) != 0)
{
target_val.u32 |= 0x10000; // Preserve signaling NaN
}
}
else
{
#ifdef MIOPEN_USE_RNE_BFLOAT16
// When the exponent bits are not all 1s, then the value is zero, normal,
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
// least significant bits of the float mantissa are greater than 0x8000,
// or if they are equal to 0x8000 and the least significant bit of the
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
// has the value 0x7f, then incrementing it causes it to become 0x00 and
// the exponent is incremented by one, which is the next higher FP value
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
// incrementing it causes it to become an exponent of 0xFF and a mantissa
// of 0x00, which is Inf, the next higher value to the unrounded value.
#ifdef __HIP_PLATFORM_HCC__
target_val.u32 += (0x7fff + (target_val.ushortvec[1] & 1));
#else
target_val.u32 +=
(0x7fff + (target_val.ushortx2.hi & 1)); // Round to nearest, round to even
#endif // __HIP_PLATFORM_HCC__
#endif // MIOPEN_USE_RNE_BFLOAT16
}
#ifdef __HIP_PLATFORM_HCC__
return target_val.ushortvec[1];
#else
return target_val.ushortx2.hi;
#endif // __HIP_PLATFORM_HCC__
}
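// Worked example of the round-to-nearest-even path above (illustrative values,
// assuming MIOPEN_USE_RNE_BFLOAT16 is defined): 1.00390625f has bit pattern
// 0x3f808000; the low 16 bits are exactly 0x8000 and the bfloat16 mantissa LSB
// (bit 16) is 0, so adding 0x7fff leaves the upper half at 0x3f80 and the value
// rounds down to even: float_to_bfloat16(1.00390625f) returns 0x3f80 (== 1.0f).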
#ifdef __cplusplus
}
#endif
#endif // BFLOAT16_DEVICE_HPP
#!/bin/bash
export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
export KMDUMPISA=1
export KMDUMPLLVM=1
#export KMOPTLLC="-mattr=+enable-ds128"
export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
export KMDUMPDIR=$PWD
make -j driver
/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm
make -j driver
#/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm