Commit 32850b93 authored by Wen-Heng (Jack) Chung

Ported xdlops kernels to debug the bwdwrw fp32/fp16/bfp16 issue. Verified that at least fwd data fp32 works.

parent 583755a7
#ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#define MIOPEN_USE_FP16 1
#define MIOPEN_USE_FP16 0
#define MIOPEN_USE_BFP16 0
#define MIOPEN_USE_FP32 0
#define MIOPEN_USE_FP32 1
#define __HIP_PLATFORM_HCC__ 1
......
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#if 0
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#ifndef CK_USE_INLINE_ASM_XDLOPS
#define CK_USE_INLINE_ASM_XDLOPS 0
#endif
namespace ck {
// float
// For some reason, the HIP compiler needs this definition to generate optimal load and store
// instructions
typedef float float32_t __attribute__((ext_vector_type(32)));
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
// half
typedef half2 half2_t;
// index_t: used for index calculation
using index_t = uint32_t;
// data type conversion
template <class T>
struct type_convert
{
template <class X>
__device__ T operator()(X x) const
{
return static_cast<T>(x);
}
};
template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
return bfloat16_to_float(x);
}
template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
return float_to_bfloat16(x);
}
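// Illustrative usage sketch (not part of this commit): the specializations above
// route float <-> bfloat16 (stored as ushort) conversions through the helpers in
// bfloat16_dev.hpp instead of a plain static_cast. The function name is hypothetical.
inline __device__ float bfloat16_roundtrip_example(float x)
{
    const ushort bf16 = type_convert<ushort>{}(x); // float -> bfloat16 bits
    return type_convert<float>{}(bf16);            // bfloat16 bits -> float
}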
} // namespace ck
#endif
......@@ -4,6 +4,9 @@
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "bfloat16_dev.hpp"
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
......@@ -11,11 +14,22 @@
namespace ck {
// float
// For some reason, the HIP compiler needs this definition to generate optimal load and store
// instructions
typedef float float32_t __attribute__((ext_vector_type(32)));
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
// half
typedef half2 half2_t;
// index_t: used for index calculation
using index_t = uint32_t;
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
......
......@@ -6,8 +6,11 @@
#include "nvToolsExt.h"
#include "helper_cuda.h"
#include "bfloat16_dev.hpp"
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INLINE_ASM 0
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
......@@ -22,6 +25,12 @@ using float4_t = float4;
using index_t = uint32_t;
using half2_t = half2;
typedef struct
{
half2 value[2];
} half4_t;
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
......@@ -51,6 +60,31 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i
}
#endif
// data type conversion
template <class T>
struct type_convert
{
template <class X>
__device__ T operator()(X x) const
{
return static_cast<T>(x);
}
};
template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
return bfloat16_to_float(x);
}
template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
return float_to_bfloat16(x);
}
} // namespace ck
#endif
......@@ -23,14 +23,16 @@ struct static_for_impl<Sequence<Is...>>
template <index_t NBegin, index_t NEnd, index_t Increment>
struct static_for
{
template <class F>
__host__ __device__ constexpr void operator()(F f) const
__host__ __device__ constexpr static_for()
{
static_assert(NBegin <= NEnd, "wrong! should have NBegin <= NEnd");
static_assert((NEnd - NBegin) % Increment == 0,
"Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
}
template <class F>
__host__ __device__ constexpr void operator()(F f) const
{
static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::type>{}(f);
}
};
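// Illustrative usage sketch (not part of this commit): static_for fully unrolls a
// loop at compile time. Assuming the functor is invoked with an integral_constant
// index (so I() yields the constexpr value), every iteration is a distinct
// instantiation of the lambda body. The function name is hypothetical.
inline __device__ void scale_unrolled_4_example(float* p, float s)
{
    static_for<0, 4, 1>{}([&](auto I) { p[I()] *= s; });
}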
......
......@@ -8,106 +8,138 @@
namespace ck {
template <class>
struct is_static : integral_constant<bool, false>
{
};
template <class T, T X>
struct is_static<integral_constant<T, X>> : integral_constant<bool, true>
{
};
template <index_t... Is>
struct is_static<Sequence<Is...>> : integral_constant<bool, true>
{
};
// RemainLengths: Sequence<...>
template <class RemainLengths>
// Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct static_ford_impl
{
// F signature: F(Sequence<...> multi_id)
// CurrentMultiIndex: Sequence<...>
template <class F, class CurrentMultiIndex>
__host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
__host__ __device__ constexpr static_ford_impl()
{
static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
}
// F signature: F(Sequence<...>)
// CurrentOrderedId: Sequence<...>
template <class F, class CurrentOrderedId>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const
{
static_for<0, RemainLengths::Front(), 1>{}([=](auto I) {
static_ford_impl<decltype(RemainLengths::PopFront())>{}(f,
CurrentMultiIndex::PushBack(I));
static_ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, CurrentOrderedId::PushBack(I));
});
}
};
template <>
struct static_ford_impl<Sequence<>>
template <class Orders>
struct static_ford_impl<Sequence<>, Orders>
{
// F signature: F(Sequence<...> multi_id)
// CurrentMultiIndex: Sequence<...>
template <class F, class CurrentMultiIndex>
__host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
// F signature: F(Sequence<...>)
// OrderedId: Sequence<...>
template <class F, class OrderedId>
__host__ __device__ constexpr void operator()(F f, OrderedId) const
{
f(CurrentMultiIndex{});
// retrieve unordered Id
f(OrderedId::ReorderGivenOld2New(Orders{}));
}
};
// Lengths is Sequence<...>
template <class Lengths>
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct static_ford
{
__host__ __device__ constexpr static_ford()
{
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
}
// F signature: F(Sequence<...> multi_id)
// multi_id is the unordered multi-index
template <class F>
__host__ __device__ constexpr void operator()(F f) const
{
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
static_ford_impl<Lengths>{}(f, Sequence<>{});
constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});
static_ford_impl<decltype(ordered_lengths), Orders>{}(f, Sequence<>{});
}
};
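// Illustrative usage sketch (not part of this commit): visit a 2x3 index space at
// compile time, iterating over dimension 1 first (Orders = Sequence<1, 0>). The
// functor still receives the multi-index in the original dimension order; Front()
// and Back() are assumed to read its first and last entry, as elsewhere in ck.
inline __device__ void zero_2x3_example(float* p)
{
    static_ford<Sequence<2, 3>, Sequence<1, 0>>{}([&](auto multi_id) {
        const index_t i = multi_id.Front(); // index along dimension 0
        const index_t j = multi_id.Back();  // index along dimension 1
        p[i * 3 + j] = 0.0f;
    });
}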
template <index_t RemainDim>
// RemainLengths: Sequence<...>
// Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct ford_impl
{
// F signature: F(Array<...> multi_id)
// CurrentMultiIndex: Array<...>
// RemainLengths: Sequence<...>
template <class F, class CurrentMultiIndex, class RemainLengths>
__host__ __device__ constexpr void
operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
__host__ __device__ constexpr ford_impl()
{
static_assert(RemainLengths::GetSize() == RemainDim, "wrong!");
static_assert(RemainDim > 1, "wrong!");
constexpr auto next_length = RemainLengths{}.Front();
static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
}
for(index_t i = 0; i < next_length; ++i)
// F signature: F(Array<...> multi_id)
// CurrentOrderedId: Array<...>
template <class F, class CurrentOrderedId>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
{
for(index_t i = 0; i < RemainLengths::Front(); ++i)
{
ford_impl<RemainDim - 1>{}(f, current_multi_id.PushBack(i), RemainLengths{}.PopFront());
ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, current_ordered_id.PushBack(i));
}
}
};
template <>
struct ford_impl<1>
template <class Orders>
struct ford_impl<Sequence<>, Orders>
{
// F signature: F(Array<...> multi_id)
// CurrentMultiIndex: Array<...>
// RemainLengths: Sequence<...>
template <class F, class CurrentMultiIndex, class RemainLengths>
__host__ __device__ constexpr void
operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
// CurrentOrderedId: Array<...>
template <class F, class CurrentOrderedId>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
{
static_assert(RemainLengths::GetSize() == 1, "wrong!");
constexpr index_t last_length = RemainLengths{}.Front();
for(index_t i = 0; i < last_length; ++i)
{
f(current_multi_id.PushBack(i));
}
// retrieve unordered Id
f(reorder_array_given_old2new(current_ordered_id, Orders{}));
}
};
// Lengths is Sequence<...>
template <class Lengths>
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct ford
{
__host__ __device__ constexpr ford()
{
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
}
// F signature: F(Array<...> multi_id)
// multi_id is the unordered multi-index
template <class F>
__host__ __device__ constexpr void operator()(F f) const
{
constexpr index_t first_length = Lengths{}.Front();
constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});
for(index_t i = 0; i < first_length; ++i)
for(index_t i = 0; i < ordered_lengths.Front(); ++i)
{
ford_impl<Lengths::GetSize() - 1>{}(f, Array<index_t, 1>{i}, Lengths{}.PopFront());
ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, Array<index_t, 1>{i});
}
}
};
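// Illustrative usage sketch (not part of this commit): ford is the runtime
// counterpart of static_ford; here multi_id arrives as an Array<index_t, 2>,
// assumed to provide operator[] as in the surrounding Array header. The function
// name is hypothetical.
inline __device__ void fill_4x8_example(float* p, float value)
{
    ford<Sequence<4, 8>>{}([&](auto multi_id) { p[multi_id[0] * 8 + multi_id[1]] = value; });
}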
......
......@@ -13,30 +13,51 @@ struct integral_constant
__host__ __device__ constexpr value_type operator()() const noexcept { return value; }
};
template <class T, T X, T Y>
__host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
template <class X, class Y>
struct is_same : public integral_constant<bool, false>
{
return integral_constant<T, X + Y>{};
}
};
template <class T, T X, T Y>
__host__ __device__ constexpr auto operator*(integral_constant<T, X>, integral_constant<T, Y>)
template <class X>
struct is_same<X, X> : public integral_constant<bool, true>
{
return integral_constant<T, X * Y>{};
}
};
template <index_t N>
using Number = integral_constant<index_t, N>;
template <class X, class Y>
struct is_same : public integral_constant<bool, false>
template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator+(Number<X>, Number<Y>)
{
};
return Number<X + Y>{};
}
template <class X>
struct is_same<X, X> : public integral_constant<bool, true>
template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator-(Number<X>, Number<Y>)
{
};
static_assert(Y <= X, "wrong!");
return Number<X - Y>{};
}
template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator*(Number<X>, Number<Y>)
{
return Number<X * Y>{};
}
template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator/(Number<X>, Number<Y>)
{
static_assert(Y > 0, "wrong!");
return Number<X / Y>{};
}
template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator%(Number<X>, Number<Y>)
{
static_assert(Y > 0, "wrong!");
return Number<X % Y>{};
}
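// Illustrative compile-time checks (not part of this commit): the operators above
// keep arithmetic in the type system, so results can feed further template logic.
static_assert(is_same<decltype(Number<7>{} + Number<5>{}), Number<12>>{}, "");
static_assert(is_same<decltype(Number<7>{} / Number<2>{}), Number<3>>{}, "");
static_assert(is_same<decltype(Number<7>{} % Number<2>{}), Number<1>>{}, "");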
} // namespace ck
#endif
......@@ -3,6 +3,7 @@
#include "config.hpp"
#include "integral_constant.hpp"
#include "vector_type.hpp"
namespace ck {
namespace math {
......@@ -42,20 +43,16 @@ struct integer_divide_ceiler
}
};
template <class T>
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
template <class X, class Y>
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
{
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
return (a + b - 1) / b;
return (x + y - 1) / y;
}
template <class T>
__host__ __device__ constexpr T integer_least_multiple(T a, T b)
template <class X, class Y>
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
{
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
return b * integer_divide_ceil(a, b);
return y * integer_divide_ceil(x, y);
}
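// Illustrative worked example (not part of this commit):
//   integer_divide_ceil(10, 4)    == (10 + 4 - 1) / 4 == 3
//   integer_least_multiple(10, 4) == 4 * 3            == 12
static_assert(integer_divide_ceil(10, 4) == 3, "");
static_assert(integer_least_multiple(10, 4) == 12, "");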
template <class T>
......@@ -102,6 +99,72 @@ __host__ __device__ constexpr T lcm(T x, Ts... xs)
return max(x, xs...);
}
template <class T>
struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(const vector_type<half, 2>::MemoryType& a,
const vector_type<half, 2>::MemoryType& b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(const vector_type<half, 4>::MemoryType& a,
const vector_type<half, 4>::MemoryType& b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(const vector_type<ushort, 2>::MemoryType& a,
const vector_type<ushort, 2>::MemoryType& b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
__device__ T operator()(const vector_type<ushort, 4>::MemoryType& a,
const vector_type<ushort, 4>::MemoryType& b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
};
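// Illustrative usage sketch (not part of this commit): dot product of two packed
// half2 values accumulated in float, letting type_convert do the element-wise
// widening. The function name is hypothetical.
inline __device__ float dot2_f16_to_f32_example(const vector_type<half, 2>::MemoryType& a,
                                                const vector_type<half, 2>::MemoryType& b)
{
    return inner_product_with_conversion<float>{}(a, b);
}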
} // namespace math
} // namespace ck
......
#ifndef CK_VECTOR_TYPE_HPP
#define CK_VECTOR_TYPE_HPP
#if 0
#include "hip/hip_fp16.h"
#else
#include "cuda_fp16.h"
#endif
#include "config.hpp"
#include "integral_constant.hpp"
......@@ -10,7 +14,10 @@ namespace ck {
template <class T, index_t N>
struct vector_type
{
T vector[N];
typedef struct
{
T scalar[N];
} MemoryType;
};
template <>
......@@ -18,8 +25,6 @@ struct vector_type<float, 1>
{
using MemoryType = float;
__host__ __device__ static constexpr index_t GetSize() { return 1; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
......@@ -33,9 +38,7 @@ struct vector_type<float, 2>
{
using MemoryType = float2_t;
__host__ __device__ static constexpr index_t GetSize() { return 2; }
union Data
union DataType
{
MemoryType vector;
float scalar[2];
......@@ -48,6 +51,13 @@ struct vector_type<float, 2>
*(reinterpret_cast<float*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
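// Illustrative usage sketch (not part of this commit): Pack assembles a native
// vector value from scalars through the DataType union defined above. The
// function name is hypothetical.
inline __host__ __device__ vector_type<float, 2>::MemoryType pack_float2_example(float s0,
                                                                                 float s1)
{
    return vector_type<float, 2>::Pack(s0, s1);
}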
template <>
......@@ -70,8 +80,6 @@ struct vector_type<half, 1>
{
using MemoryType = half;
__host__ __device__ static constexpr index_t GetSize() { return 1; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
......@@ -83,16 +91,14 @@ struct vector_type<half, 1>
template <>
struct vector_type<half, 2>
{
using MemoryType = half2;
using MemoryType = half2_t;
union Data
union DataType
{
MemoryType vector;
half scalar[2];
};
__host__ __device__ static constexpr index_t GetSize() { return 2; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
......@@ -100,17 +106,25 @@ struct vector_type<half, 2>
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
typedef struct MemoryType
{
half2 vector[2];
} MemoryType;
using MemoryType = half4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
union DataType
{
MemoryType vector;
half scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
......@@ -118,15 +132,24 @@ struct vector_type<half, 4>
static_assert(I < 4, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
#if 0
template <>
struct vector_type<ushort, 1>
{
using MemoryType = ushort;
__host__ __device__ static constexpr index_t GetSize() { return 1; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
......@@ -138,16 +161,14 @@ struct vector_type<ushort, 1>
template <>
struct vector_type<ushort, 2>
{
using MemoryType = ushort2;
using MemoryType = ushort2_t;
union Data
union DataType
{
MemoryType vector;
half scalar[2];
ushort scalar[2];
};
__host__ __device__ static constexpr index_t GetSize() { return 2; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
......@@ -155,17 +176,25 @@ struct vector_type<ushort, 2>
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<ushort, 4>
{
typedef struct MemoryType
{
ushort2 vector[2];
} MemoryType;
using MemoryType = ushort4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
union DataType
{
MemoryType vector;
ushort scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
......@@ -173,8 +202,20 @@ struct vector_type<ushort, 4>
static_assert(I < 4, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
#endif
} // namespace ck
#endif
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
#include "implicitgemm_params.hpp"
extern "C" __global__
__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer(
const FLOAT* const __restrict__ p_in_global,
const FLOAT* const __restrict__ p_wei_global,
FLOAT* const __restrict__ p_out_global)
{
using namespace ck;
// read params: problem description
constexpr index_t N = CK_PARAM_PROBLEM_N;
constexpr index_t K = CK_PARAM_PROBLEM_K;
constexpr index_t C = CK_PARAM_PROBLEM_C;
constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;
// read params: tunable params
constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
constexpr index_t CPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;
// read params: dependent params
constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;
constexpr auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
constexpr auto wei_ck_desc = make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<1, C>{});
using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;
constexpr index_t GemmMPerThreadSubC = CK_PARAM_GEMM_M_PER_THREAD_SUB_C;
constexpr index_t GemmNPerThreadSubC = CK_PARAM_GEMM_N_PER_THREAD_SUB_C;
constexpr index_t GemmMLevel0Cluster = CK_PARAM_GEMM_M_LEVEL0_CLUSTER;
constexpr index_t GemmNLevel0Cluster = CK_PARAM_GEMM_N_LEVEL0_CLUSTER;
constexpr index_t GemmMLevel1Cluster = CK_PARAM_GEMM_M_LEVEL1_CLUSTER;
constexpr index_t GemmNLevel1Cluster = CK_PARAM_GEMM_N_LEVEL1_CLUSTER;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = GemmMPerThreadSubC;
constexpr index_t GemmDataPerReadB = GemmNPerThreadSubC;
constexpr index_t GemmNRepeat = 2;
constexpr index_t N1 = GemmNRepeat;
constexpr index_t N2 = GemmNPerThreadSubC;
constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
constexpr index_t InBlockCopyClusterLengths_N1 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N1;
constexpr index_t InBlockCopyClusterLengths_N2 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N2;
constexpr index_t InBlockCopySubLengths_E = CPerBlock / InBlockCopyClusterLengths_E;
constexpr index_t InBlockCopySubLengths_B = BPerBlock / InBlockCopyClusterLengths_B;
constexpr index_t InBlockCopySubLengths_N1 = N1 / InBlockCopyClusterLengths_N1;
constexpr index_t InBlockCopySubLengths_N2 = N2 / InBlockCopyClusterLengths_N2;
using InBlockCopySubLengths_E_N1_B_N2 = Sequence<InBlockCopySubLengths_E,
InBlockCopySubLengths_N1,
InBlockCopySubLengths_B,
InBlockCopySubLengths_N2>;
using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<InBlockCopyClusterLengths_E,
InBlockCopyClusterLengths_N1,
InBlockCopyClusterLengths_B,
InBlockCopyClusterLengths_N2>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
constexpr index_t InBlockCopySrcDataPerRead_B = CK_PARAM_IN_BLOCK_COPY_SRC_DATA_PER_READ_B;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;
constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
constexpr index_t WeiBlockCopySubLengths_E = CPerBlock / WeiBlockCopyClusterLengths_E;
constexpr index_t WeiBlockCopySubLengths_K = KPerBlock / WeiBlockCopyClusterLengths_K;
using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
using WeiBlockCopyClusterLengths_E_K =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATE_PER_WRITE_K;
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4_nchw_kc1x1_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
FLOAT_ACCUM,
decltype(in_nchw_desc),
decltype(wei_ck_desc),
decltype(out_nkhw_desc),
ConvStrides,
static_cast<ImplicitGemmDirection>(CK_PARAM_PROBLEM_DIRECTION),
BPerBlock,
KPerBlock,
CPerBlock,
N1,
N2,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
extern "C" __global__
__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer(
const FLOAT* const __restrict__ p_in_global,
const FLOAT* const __restrict__ p_wei_global,
FLOAT* const __restrict__ p_out_global)
{
using namespace ck;
// read params: problem description
constexpr index_t N = CK_PARAM_PROBLEM_N;
constexpr index_t K = CK_PARAM_PROBLEM_K;
constexpr index_t C = CK_PARAM_PROBLEM_C;
constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
constexpr index_t Y = CK_PARAM_PROBLEM_Y;
constexpr index_t X = CK_PARAM_PROBLEM_X;
constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;
constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;
// read params: tunable params
constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;
// read params: dependent params
constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;
// calculate dependent params and heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
// In the WrW direction the filter is the output, while the output image is the input being
// convolved with the (original) input image. This requires that the tensor descriptors be
// swapped
// To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
// input descriptor, the n and k dimension of the output descriptor
// This change is necessary so that reduction dimensions are consistent with the requirement
// of the wrw convolution when used in a fwd context
constexpr auto tmp_in_nchw_desc =
make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto tmp_out_nkhw_desc =
make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
// wei and out are swapped in the solver
constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
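// Concrete effect of the reorders above (illustrative): ReorderGivenNew2Old(Sequence<1, 0, 2, 3>)
// swaps the first two dimensions, so the NCHW input descriptor [N, C, Hi, Wi] becomes
// [C, N, Hi, Wi], and the NKHW output descriptor [N, K, Ho, Wo] (used here as the weight)
// becomes [K, N, Ho, Wo].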
constexpr auto dir = ImplicitGemmDirection::BackwardWeight;
// swap stride and dilation
using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
using ConvStrides = Sequence<ConvDilationH, ConvDilationW>;
#else
constexpr auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
constexpr auto dir = ImplicitGemmDirection::ForwardData;
using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;
using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2
constexpr index_t GemmMPerThreadSubC = CK_PARAM_GEMM_M_PER_THREAD_SUB_C;
constexpr index_t GemmNPerThreadSubC = CK_PARAM_GEMM_N_PER_THREAD_SUB_C;
constexpr index_t GemmMLevel0Cluster = CK_PARAM_GEMM_M_LEVEL0_CLUSTER;
constexpr index_t GemmNLevel0Cluster = CK_PARAM_GEMM_N_LEVEL0_CLUSTER;
constexpr index_t GemmMLevel1Cluster = CK_PARAM_GEMM_M_LEVEL1_CLUSTER;
constexpr index_t GemmNLevel1Cluster = CK_PARAM_GEMM_N_LEVEL1_CLUSTER;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmNRepeat = CK_PARAM_GEMM_N_REPEAT;
constexpr index_t N1 = GemmNRepeat;
constexpr index_t N2 = GemmNPerThreadSubC;
constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
constexpr index_t InBlockCopyClusterLengths_N1 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N1;
constexpr index_t InBlockCopyClusterLengths_N2 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N2;
constexpr index_t InBlockCopySubLengths_E = EPerBlock / InBlockCopyClusterLengths_E;
constexpr index_t InBlockCopySubLengths_B = BPerBlock / InBlockCopyClusterLengths_B;
constexpr index_t InBlockCopySubLengths_N1 = N1 / InBlockCopyClusterLengths_N1;
constexpr index_t InBlockCopySubLengths_N2 = N2 / InBlockCopyClusterLengths_N2;
constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
constexpr index_t WeiBlockCopySubLengths_E = EPerBlock / WeiBlockCopyClusterLengths_E;
constexpr index_t WeiBlockCopySubLengths_K = KPerBlock / WeiBlockCopyClusterLengths_K;
#if MIOPEN_USE_FP32
constexpr index_t GemmDataPerReadA = GemmMPerThreadSubC;
constexpr index_t GemmDataPerReadB = GemmNPerThreadSubC;
using InBlockCopySubLengths_E_N1_B_N2 = Sequence<InBlockCopySubLengths_E,
InBlockCopySubLengths_N1,
InBlockCopySubLengths_B,
InBlockCopySubLengths_N2>;
using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<InBlockCopyClusterLengths_E,
InBlockCopyClusterLengths_N1,
InBlockCopyClusterLengths_B,
InBlockCopyClusterLengths_N2>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
constexpr index_t InBlockCopySrcDataPerRead_B = CK_PARAM_IN_BLOCK_COPY_SRC_DATA_PER_READ_B;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;
using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
using WeiBlockCopyClusterLengths_E_K =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATE_PER_WRITE_K;
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
constexpr index_t GemmDataPerReadA = 1;
constexpr index_t GemmDataPerReadB = 1;
constexpr index_t EPACK = CK_PARAM_EPACK_LENGTH;
using InBlockCopySubLengths_E_N1_B_N2_EPACK = Sequence<InBlockCopySubLengths_E,
InBlockCopySubLengths_N1,
InBlockCopySubLengths_B,
InBlockCopySubLengths_N2,
EPACK>;
using InBlockCopyClusterLengths_E_N1_B_N2_EPACK = Sequence<InBlockCopyClusterLengths_E,
InBlockCopyClusterLengths_N1,
InBlockCopyClusterLengths_B,
InBlockCopyClusterLengths_N2,
1>;
constexpr index_t InBlockCopySrcDataPerRead_B = 1;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;
// EPACK - E dimension is folded into 2 dimensions E and EPACK
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2, 4>; // [E, N1, N2, B, EPACK]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2, 4>; // [E, N1, N2, B, EPACK]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3, 4>; // [E, N1, B, N2, EPACK]
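// Concretely (illustrative): with EPACK = 4 (the fp16 dot4 case), the copy descriptors
// above carry an innermost EPACK dimension whose sub-length is 4 and whose cluster
// length is 1, so each thread moves 4 packed scalars per step along E.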
using WeiBlockCopySubLengths_E_K_EPACK =
Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPACK>;
using WeiBlockCopyClusterLengths_E_K_EPACK =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPACK]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0, 2>; // [K, E, EPACK]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E, K, EPACK]
constexpr index_t WeiBlockCopySrcDataPerRead_E = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#else
static_assert(false, "wrong! Only kperblock could be 32/64/128 not supported");
#endif
#if MIOPEN_USE_FP32
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
FLOAT_ACCUM,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
GemmNRepeat,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K,
dir>{};
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
FLOAT_ACCUM,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
GemmNRepeat,
EPACK,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N1_B_N2_EPACK,
InBlockCopyClusterLengths_E_N1_B_N2_EPACK,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopySubLengths_E_K_EPACK,
WeiBlockCopyClusterLengths_E_K_EPACK,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K,
dir>{};
#else
static_assert(false, "wrong! Only fp32, fp16 and bfp16 are supported.");
#endif
gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
#include "implicitgemm_params.hpp"
extern "C" __global__
__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer(
const FLOAT* const __restrict__ p_in_global,
const FLOAT* const __restrict__ p_wei_global,
FLOAT* const __restrict__ p_out_global)
{
using namespace ck;
// read params: problem description
constexpr index_t N = CK_PARAM_PROBLEM_N;
constexpr index_t K = CK_PARAM_PROBLEM_K;
constexpr index_t C = CK_PARAM_PROBLEM_C;
constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;
// read params: tunable params
constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;
// read params: dependent params
constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;
// calculate dependent params and heuristic params
constexpr auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto wei_ck_desc = make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<1, C>{});
constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;
constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
constexpr index_t InBlockCopySubLengths_E = EPerBlock / InBlockCopyClusterLengths_E;
constexpr index_t InBlockCopySubLengths_B = BPerBlock / InBlockCopyClusterLengths_B;
constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
constexpr index_t WeiBlockCopySubLengths_E = EPerBlock / WeiBlockCopyClusterLengths_E;
constexpr index_t WeiBlockCopySubLengths_K = KPerBlock / WeiBlockCopyClusterLengths_K;
using InBlockCopySubLengths_E_B = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
using InBlockCopyClusterLengths_E_B =
Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t InBlockCopyDataPerAccess_B = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;
using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
using WeiBlockCopyClusterLengths_E_K =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;
constexpr index_t OutThreadCopyDataPerAccess_B = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;
constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
constexpr auto GemmMWaves = KPerBlock / GemmMPerWave;
constexpr auto GemmNWaves = BPerBlock / GemmNPerWave;
constexpr auto GemmDataPerReadA = 1;
constexpr auto GemmDataPerReadB = 1;
constexpr auto EnableXdlops = CK_ENABLE_XDLOPS == 1;
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
decltype(in_nchw_desc),
decltype(wei_ck_desc),
decltype(out_nkhw_desc),
ConvStrides,
static_cast<ImplicitGemmDirection>(CK_PARAM_PROBLEM_DIRECTION),
BPerBlock,
KPerBlock,
EPerBlock,
GemmMPerWave,
GemmNPerWave,
GemmMWaves,
GemmNWaves,
GemmDataPerReadA,
GemmDataPerReadB,
EnableXdlops,
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K,
OutThreadCopyDataPerAccess_B>{};
gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
extern "C" __global__
__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer(
const FLOAT* const __restrict__ p_in_global,
const FLOAT* const __restrict__ p_wei_global,
FLOAT* const __restrict__ p_out_global)
{
using namespace ck;
// read params: problem description
constexpr index_t N = CK_PARAM_PROBLEM_N;
constexpr index_t K = CK_PARAM_PROBLEM_K;
constexpr index_t C = CK_PARAM_PROBLEM_C;
constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
constexpr index_t Y = CK_PARAM_PROBLEM_Y;
constexpr index_t X = CK_PARAM_PROBLEM_X;
constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;
constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;
// read params: tunable params
constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;
// read params: dependent params
constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;
// calculate dependent params and heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
// In the WrW direction the filter is the output, while the output image is the input being
// convolved with the (original) input image. This requires that the tensor descriptors be
// swapped
// To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
// input descriptor, the n and k dimension of the output descriptor
// This change is necessary so that reduction dimensions are consistent with the requirement
// of the wrw convolution when used in a fwd context
constexpr auto tmp_in_nchw_desc =
make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto tmp_out_nkhw_desc =
make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
// wei and out are swapped in the solver
constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
constexpr auto dir = ImplicitGemmDirection::BackwardWeight;
// swap stride and dilation
using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
using ConvStrides = Sequence<ConvDilationH, ConvDilationW>;
#else
// calculate dependent params and heuristic params
constexpr auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
constexpr auto dir = ImplicitGemmDirection::ForwardData;
using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;
using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2
constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
constexpr index_t InBlockCopySubLengths_E = EPerBlock / InBlockCopyClusterLengths_E;
constexpr index_t InBlockCopySubLengths_B = BPerBlock / InBlockCopyClusterLengths_B;
constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
constexpr index_t WeiBlockCopySubLengths_E = EPerBlock / WeiBlockCopyClusterLengths_E;
constexpr index_t WeiBlockCopySubLengths_K = KPerBlock / WeiBlockCopyClusterLengths_K;
constexpr index_t EPack = CK_PARAM_EPACK_LENGTH;
#if MIOPEN_USE_FP32
using InBlockCopySubLengths_E_B = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
using InBlockCopyClusterLengths_E_B =
Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
using WeiBlockCopyClusterLengths_E_K =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
using InBlockCopySubLengths_E_B =
Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B, EPack>;
using InBlockCopyClusterLengths_E_B =
Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B, 1>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E, B, EPack]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 2>; // [E, B, EPack]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E, B, EPack]
using WeiBlockCopySubLengths_E_K =
Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPack>;
using WeiBlockCopyClusterLengths_E_K =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPack]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0, 2>; // [K, E, EPack]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E, K, EPack]
#endif
constexpr index_t InBlockCopyDataPerAccess_B = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;
constexpr index_t WeiBlockCopySrcDataPerRead_E = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;
constexpr index_t OutThreadCopyDataPerAccess_B = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;
constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
constexpr auto GemmMWaves = KPerBlock / GemmMPerWave;
constexpr auto GemmNWaves = BPerBlock / GemmNPerWave;
constexpr index_t GemmDataPerReadA = 1;
constexpr index_t GemmDataPerReadB = 1;
constexpr auto gridwise_conv =
#if MIOPEN_USE_FP32
GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
FLOAT_ACCUM,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
EPack,
GemmMPerWave,
GemmNPerWave,
GemmMWaves,
GemmNWaves,
GemmDataPerReadA,
GemmDataPerReadB,
(CK_ENABLE_XDLOPS == 1),
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K,
OutThreadCopyDataPerAccess_B,
dir>{};
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
FLOAT_ACCUM,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
EPack,
GemmMPerWave,
GemmNPerWave,
GemmMWaves,
GemmNWaves,
GemmDataPerReadA,
GemmDataPerReadB,
(CK_ENABLE_XDLOPS == 1),
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K,
OutThreadCopyDataPerAccess_B,
dir>{};
#else
static_assert(false, "wrong! Only fp32, fp16 and bfp16 are supported.");
#endif
gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
......@@ -32,6 +32,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe
constexpr auto HO = HI + 1 - Y;
constexpr auto WO = WI + 1 - X;
printf("H0=%d, W0=%d\n", HO, WO);
return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
}
......
#pragma once
#include <unistd.h>
#define MIOPEN_USE_FP16 1
#define MIOPEN_USE_FP16 0
#define MIOPEN_USE_BFP16 0
#define MIOPEN_USE_FP32 0
#define MIOPEN_USE_FP32 1
#define __HIP_PLATFORM_HCC__ 1
......
#pragma once
#include <unistd.h>
#define MIOPEN_USE_FP16 0
#define MIOPEN_USE_BFP16 0
#define MIOPEN_USE_FP32 1
#define __HIP_PLATFORM_HCC__ 1
#include "float_types.h"
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
//#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#define CK_ENABLE_XDLOPS 0
#define CK_PARAM_PROBLEM_DIRECTION 0
#define CK_PARAM_EPACK_LENGTH 1
#define CK_PARAM_TUNABLE_BLOCK_SIZE 64
#define CK_PARAM_TUNABLE_K_PER_BLOCK 32
#define CK_PARAM_TUNABLE_B_PER_BLOCK 64
#define CK_PARAM_TUNABLE_E_PER_BLOCK 8
#define CK_PARAM_DEPENDENT_GRID_SIZE 16
#define CK_PARAM_GEMM_M_PER_WAVE 32
#define CK_PARAM_GEMM_N_PER_WAVE 64
#define CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E 8
#define CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B 8
#define CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E 4
#define CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K 16
#define CK_PARAM_PROBLEM_CONV_DILATION_W 1
#define CK_PARAM_PROBLEM_CONV_DILATION_H 1
#define CK_PARAM_PROBLEM_CONV_STRIDE_H 1
#define CK_PARAM_PROBLEM_CONV_STRIDE_W 1
#define CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B 1
#define CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E 2
#define CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K 2
#define CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B 1
using namespace ck;
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations>
void device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
index_t nrepeat)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc_org = InDesc{};
constexpr auto wei_kcyx_desc_org = WeiDesc{};
constexpr auto out_nkhw_desc_org = OutDesc{};
constexpr index_t Hi = in_nchw_desc_org.GetLength(I2);
constexpr index_t Wi = in_nchw_desc_org.GetLength(I3);
constexpr index_t N = out_nkhw_desc_org.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc_org.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc_org.GetLength(I3);
constexpr index_t K = wei_kcyx_desc_org.GetLength(I0);
constexpr index_t C = wei_kcyx_desc_org.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc_org.GetLength(I2);
constexpr index_t X = wei_kcyx_desc_org.GetLength(I3);
constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;
constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;
// read params: tunable params
constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;
// read params: dependent params
constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;
// calculate dependent params and heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
// In the WrW direction the filter is the output, while the output image is the input being
// convolved with the (original) input image. This requires that the tensor descriptors be
// swapped
// To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
// input descriptor, the n and k dimension of the output descriptor
// This change is necessary so that reduction dimensions are consistent with the requirement
// of the wrw convolution when used in a fwd context
printf("backward weight is executed\n");
constexpr auto tmp_in_nchw_desc =
make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto tmp_out_nkhw_desc =
make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
// wei and out are swapped in the solver
constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
constexpr auto dir = ImplicitGemmDirection::BackwardWeight;
// swap stride and dilation
// using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
// using ConvStrides = Sequence<ConvDilationH, ConvDilationW>;
#else
printf("forward data is executed\n");
// calculate dependent params and heuristic params
constexpr auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
constexpr auto dir = ImplicitGemmDirection::ForwardData;
// using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;
// using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2
constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
constexpr index_t InBlockCopySubLengths_E = EPerBlock / InBlockCopyClusterLengths_E;
constexpr index_t InBlockCopySubLengths_B = BPerBlock / InBlockCopyClusterLengths_B;
constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
constexpr index_t WeiBlockCopySubLengths_E = EPerBlock / WeiBlockCopyClusterLengths_E;
constexpr index_t WeiBlockCopySubLengths_K = KPerBlock / WeiBlockCopyClusterLengths_K;
constexpr index_t EPack = CK_PARAM_EPACK_LENGTH;
#if MIOPEN_USE_FP32
printf("fp32 is executed\n");
using InBlockCopySubLengths_E_B = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
using InBlockCopyClusterLengths_E_B =
Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
using WeiBlockCopyClusterLengths_E_K =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
using InBlockCopySubLengths_E_B =
Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B, EPack>;
using InBlockCopyClusterLengths_E_B =
Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B, 1>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E, B, EPack]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 2>; // [E, B, EPack]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E, B, EPack]
using WeiBlockCopySubLengths_E_K =
Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPack>;
using WeiBlockCopyClusterLengths_E_K =
Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPack]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0, 2>; // [K, E, EPack]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E, K, EPack]
#endif
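// Note on the fp16/bfp16 branch above: the copy lengths gain a third dimension of
// size EPack with cluster length 1, so each thread moves EPack consecutive values
// along E together; presumably this feeds the packed dot/xdlops path, but that
// reading of EPack is an assumption, not something stated in this file.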
constexpr index_t InBlockCopyDataPerAccess_B = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;
constexpr index_t WeiBlockCopySrcDataPerRead_E = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;
constexpr index_t OutThreadCopyDataPerAccess_B = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;
constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
constexpr auto GemmMWaves = KPerBlock / GemmMPerWave;
constexpr auto GemmNWaves = BPerBlock / GemmNPerWave;
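// Worked example with hypothetical block sizes (not taken from this commit):
//   KPerBlock = 128, GemmMPerWave = 64  => GemmMWaves = 2
//   BPerBlock = 128, GemmNPerWave = 64  => GemmNWaves = 2
// i.e. a 2x2 grid of waves tiles the KPerBlock x BPerBlock block.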
constexpr index_t GemmDataPerReadA = 1;
constexpr index_t GemmDataPerReadB = 1;
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
// #if MIOPEN_USE_FP16 == 1
// // ES set to 4 as dot4 operator is supported on fp16 in MI100
// constexpr index_t ES = 4;
// #elif MIOPEN_USE_BFP16 == 1
// // ES set to 2 as dot2 operator is supported on bfp16 in MI100
// constexpr index_t ES = 2;
// #else
// // do nothing
// #endif
// constexpr index_t GridSize =
// ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if MIOPEN_USE_FP32 == 1
GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
FLOAT_ACCUM,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
EPack,
GemmMPerWave,
GemmNPerWave,
GemmMWaves,
GemmNWaves,
GemmDataPerReadA,
GemmDataPerReadB,
false,
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K,
OutThreadCopyDataPerAccess_B,
dir>{};
#elif MIOPEN_USE_FP16 == 1 || MIOPEN_USE_BFP16 == 1
GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
FLOAT,
FLOAT_ACCUM,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
EPack,
GemmMPerWave,
GemmNPerWave,
GemmMWaves,
GemmNWaves,
GemmDataPerReadA,
GemmDataPerReadB,
false,
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K,
OutThreadCopyDataPerAccess_B,
dir>{};
#endif
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
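// Unit check for the print below: launch_kernel is assumed to return time in ms,
// so flops / 1e9 / time_ms == flops / (time_s * 1e12), i.e. the value printed is TFlop/s.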
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
......@@ -8,11 +8,12 @@
#include "device.hpp"
#include "conv_common.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
// #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
// #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
// #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
// #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp"
using namespace ck;
......@@ -400,6 +401,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
float ref_value = 0, result_value = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
std::cout << result.mData[i] << " ";
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
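// error accumulates the L1 (sum of absolute) difference over all elements,
// while max_diff records the largest per-element difference seen so far.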
if(max_diff < diff)
......@@ -410,6 +412,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
}
}
std::cout << std::endl;
std::cout << "error: " << error << std::endl;
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
......@@ -803,7 +806,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
constexpr index_t N = 8;
constexpr index_t N = 32;
constexpr index_t C = 64;
constexpr index_t HI = 4;
constexpr index_t WI = 4;
......@@ -830,8 +833,8 @@ int main(int argc, char* argv[])
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
using in_data_t = half;
using out_data_t = half;
using in_data_t = float;
using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
......@@ -850,7 +853,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 0
#if 1
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0
......@@ -859,7 +862,7 @@ int main(int argc, char* argv[])
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 1
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#elif 0
......@@ -883,8 +886,10 @@ int main(int argc, char* argv[])
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
#elif 1
#elif 0
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
#elif 1
device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw
#endif
(in_nchw_desc,
in_nchw,
......