gaoqiong / composable_kernel

Commit 32850b93, authored Oct 09, 2019 by Wen-Heng (Jack) Chung
Parent: 583755a7

    Ported xdlops kernels to debug bwdwrw fp32/fp16/bfp16 issue. Verified at least fwd data fp32 works.

Showing 17 changed files with 1462 additions and 123 deletions (+1462 -123)
composable_kernel/include/utility/common_header.hpp  +2 -2
composable_kernel/include/utility/config.hpp.bkup  +70 -0
composable_kernel/include/utility/config_amd.hpp.in  +14 -0
composable_kernel/include/utility/config_nvidia.hpp.in  +34 -0
composable_kernel/include/utility/functional2.hpp  +5 -3
composable_kernel/include/utility/functional3.hpp  +84 -52
composable_kernel/include/utility/integral_constant.hpp  +35 -14
composable_kernel/include/utility/math.hpp  +73 -10
composable_kernel/include/utility/vector_type.hpp  +70 -29
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.cpp  +139 -0
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.cpp  +256 -0
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.cpp  +124 -0
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp  +217 -0
driver/include/conv_common.hpp  +1 -0
driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp  +2 -2
driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp  +320 -0
driver/src/driver.cpp  +16 -11
composable_kernel/include/utility/common_header.hpp

 #ifndef CK_COMMON_HEADER_HPP
 #define CK_COMMON_HEADER_HPP

-#define MIOPEN_USE_FP16 1
+#define MIOPEN_USE_FP16 0
 #define MIOPEN_USE_BFP16 0
-#define MIOPEN_USE_FP32 0
+#define MIOPEN_USE_FP32 1

 #define __HIP_PLATFORM_HCC__ 1
 ...
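These MIOPEN_USE_* switches decide what the FLOAT and FLOAT_ACCUM aliases used by the kernel wrappers below resolve to (via float_types.h, which is not part of this commit). A minimal sketch of that kind of gating — illustrative only, not the actual contents of float_types.h:

// Illustrative sketch only; the real mapping lives in float_types.h.
#if MIOPEN_USE_FP32
typedef float FLOAT;        // tensor data type
typedef float FLOAT_ACCUM;  // accumulator type
#elif MIOPEN_USE_FP16
typedef _Float16 FLOAT;
typedef float FLOAT_ACCUM;
#elif MIOPEN_USE_BFP16
typedef ushort FLOAT;       // bfloat16 carried as ushort, as in config.hpp.bkup below
typedef float FLOAT_ACCUM;
#endif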
composable_kernel/include/utility/config.hpp.bkup
0 → 100644

#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

#if 0
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"

#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

#ifndef CK_USE_INLINE_ASM_XDLOPS
#define CK_USE_INLINE_ASM_XDLOPS 0
#endif

namespace ck {

// float
// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef float float32_t __attribute__((ext_vector_type(32)));
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));

// half
typedef half2 half2_t;

// index_t: used for index calculation
using index_t = uint32_t;

// data type conversion
template <class T>
struct type_convert
{
    template <class X>
    __device__ T operator()(X x) const
    {
        return static_cast<T>(x);
    }
};

template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
    return bfloat16_to_float(x);
}

template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
    return float_to_bfloat16(x);
}

} // namespace ck
#endif
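For reference, a host-side sketch of what the bfloat16_to_float / float_to_bfloat16 helpers from bfloat16_dev.hpp compute, assuming plain truncation (the device versions may round differently):

#include <cstdint>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 float.
float bfloat16_to_float_host(uint16_t x)
{
    uint32_t bits = static_cast<uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

uint16_t float_to_bfloat16_host(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16); // truncate, no rounding
}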
composable_kernel/include/utility/config_amd.hpp.in

@@ -4,6 +4,9 @@
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#include "bfloat16_dev.hpp"

 #define CK_DEVICE_BACKEND_AMD 1
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 ...
@@ -11,11 +14,22 @@
 namespace ck {

+// float
 // For some reason, HIP compiler need this definition to generate optimal load and store
 // instruction
+typedef float float32_t __attribute__((ext_vector_type(32)));
 typedef float float2_t __attribute__((ext_vector_type(2)));
 typedef float float4_t __attribute__((ext_vector_type(4)));
+typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
+typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
+typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
+
+// half
+typedef half2 half2_t;
+
+// index_t: used for index calculation
 using index_t = uint32_t;

 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 ...
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -6,8 +6,11 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#include "bfloat16_dev.hpp"

 #define CK_DEVICE_BACKEND_NVIDIA 1
 #define CK_USE_AMD_INLINE_ASM 0
+#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 ...
@@ -22,6 +25,12 @@ using float4_t = float4;
 using index_t = uint32_t;

+using half2_t = half2;
+
+typedef struct
+{
+    half2 value[2];
+} half4_t;
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
 ...
@@ -51,6 +60,31 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i
 }
 #endif

+// data type conversion
+template <class T>
+struct type_convert
+{
+    template <class X>
+    __device__ T operator()(X x) const
+    {
+        return static_cast<T>(x);
+    }
+};
+
+template <>
+template <>
+__device__ float type_convert<float>::operator()<ushort>(ushort x) const
+{
+    return bfloat16_to_float(x);
+}
+
+template <>
+template <>
+__device__ ushort type_convert<ushort>::operator()<float>(float x) const
+{
+    return float_to_bfloat16(x);
+}
+
 } // namespace ck
 #endif
composable_kernel/include/utility/functional2.hpp

@@ -23,14 +23,16 @@ struct static_for_impl<Sequence<Is...>>
 template <index_t NBegin, index_t NEnd, index_t Increment>
 struct static_for
 {
+    __host__ __device__ constexpr static_for()
+    {
+        static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
+        static_assert((NEnd - NBegin) % Increment == 0,
+                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
+    }
+
     template <class F>
     __host__ __device__ constexpr void operator()(F f) const
     {
-        static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
-        static_assert((NEnd - NBegin) % Increment == 0,
-                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
         static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::type>{}(f);
     }
 };
 ...
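For context, a usage sketch of static_for as the files below use it (I arrives as a compile-time Number<>):

#include "functional2.hpp"

// Unrolled compile-time loop over I = 0, 2, 4; each I is a distinct Number<> type,
// so it can be used in constant expressions inside the lambda.
ck::static_for<0, 6, 2>{}([&](auto I) {
    // ... body instantiated once per I ...
});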
composable_kernel/include/utility/functional3.hpp

@@ -8,106 +8,138 @@

static_ford and ford (and their _impl helpers) gain an Orders template parameter that selects the order in which the dimensions are looped over, an is_static trait is added, and ford_impl is re-keyed from a RemainDim count to a (RemainLengths, Orders) pair. The region now reads:

namespace ck {

template <class>
struct is_static : integral_constant<bool, false>
{
};

template <class T, T X>
struct is_static<integral_constant<T, X>> : integral_constant<bool, true>
{
};

template <index_t... Is>
struct is_static<Sequence<Is...>> : integral_constant<bool, true>
{
};

// RemainLengths: Sequence<...>
// Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct static_ford_impl
{
    __host__ __device__ constexpr static_ford_impl()
    {
        static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
    }

    // F signature: F(Sequence<...>)
    // CurrentOrderedId: Sequence<...>
    template <class F, class CurrentOrderedId>
    __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const
    {
        static_for<0, RemainLengths::Front(), 1>{}([=](auto I) {
            static_ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
                f, CurrentOrderedId::PushBack(I));
        });
    }
};

template <class Orders>
struct static_ford_impl<Sequence<>, Orders>
{
    // F signature: F(Sequence<...>)
    // OrderedId: Sequence<...>
    template <class F, class OrderedId>
    __host__ __device__ constexpr void operator()(F f, OrderedId) const
    {
        // retrive unordered Id
        f(OrderedId::ReorderGivenOld2New(Orders{}));
    }
};

// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each
// dimension
template <class Lengths,
          class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct static_ford
{
    __host__ __device__ constexpr static_ford()
    {
        static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
        static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
    }

    // F signature: F(Sequence<...> multi_id)
    // multi_id is the unordered multi-index
    template <class F>
    __host__ __device__ constexpr void operator()(F f) const
    {
        constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});

        static_ford_impl<decltype(ordered_lengths), Orders>{}(f, Sequence<>{});
    }
};

// RemainLengths: Sequence<...>
// Orders: Sequence<...>
template <class RemainLengths, class Orders>
struct ford_impl
{
    // F signature: F(Array<...> multi_id)
    // CurrentOrderdId: Array<...>
    template <class F, class CurrentOrderedId>
    __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
    {
        for(index_t i = 0; i < RemainLengths::Front(); ++i)
        {
            ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
                f, current_ordered_id.PushBack(i));
        }
    }
};

template <class Orders>
struct ford_impl<Sequence<>, Orders>
{
    // F signature: F(Array<...> multi_id)
    // CurrentOrderdId: Array<...>
    template <class F, class CurrentOrderedId>
    __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
    {
        // retrive unordered Id
        f(reorder_array_given_old2new(current_ordered_id, Orders{}));
    }
};

// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
// dimension
template <class Lengths,
          class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
struct ford
{
    __host__ __device__ constexpr ford()
    {
        static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
        static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
    }

    // F signature: F(Array<...> multi_id)
    // multi_id is the unordered multi-index
    template <class F>
    __host__ __device__ constexpr void operator()(F f) const
    {
        constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});

        for(index_t i = 0; i < ordered_lengths.Front(); ++i)
        {
            ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, Array<index_t, 1>{i});
        }
    }
};
...
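A usage sketch of the new Orders parameter (hypothetical lengths and orders, not from the diff): the loops run in the requested dimension order, but the functor still receives the unordered multi-index.

#include "functional3.hpp"

using Lengths = ck::Sequence<2, 3>;
using Orders  = ck::Sequence<1, 0>; // iterate dimension 1 as the outer loop

ck::ford<Lengths, Orders>{}([&](auto idx) {
    // idx is an Array<index_t, 2>: idx[0] in [0, 2), idx[1] in [0, 3),
    // visited with idx[1] varying slowest
});

ck::static_ford<Lengths, Orders>{}([&](auto idx) {
    // idx is a Sequence<...> known at compile time, same ordering behaviour
});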
composable_kernel/include/utility/integral_constant.hpp

@@ -13,30 +13,51 @@ struct integral_constant

is_same moves ahead of the Number alias, and the operator+ / operator* overloads on generic integral_constant are replaced by +, -, *, /, % overloads on Number<>, with static_asserts guarding underflow and division by zero. The region now reads:

    __host__ __device__ constexpr value_type operator()() const noexcept { return value; }
};

template <class X, class Y>
struct is_same : public integral_constant<bool, false>
{
};

template <class X>
struct is_same<X, X> : public integral_constant<bool, true>
{
};

template <index_t N>
using Number = integral_constant<index_t, N>;

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator+(Number<X>, Number<Y>)
{
    return Number<X + Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator-(Number<X>, Number<Y>)
{
    static_assert(Y <= X, "wrong!");
    return Number<X - Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator*(Number<X>, Number<Y>)
{
    return Number<X * Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator/(Number<X>, Number<Y>)
{
    static_assert(Y > 0, "wrong!");
    return Number<X / Y>{};
}

template <index_t X, index_t Y>
__host__ __device__ constexpr auto operator%(Number<X>, Number<Y>)
{
    static_assert(Y > 0, "wrong!");
    return Number<X % Y>{};
}

} // namespace ck
#endif
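A quick illustration (not in the diff) of the new Number<> arithmetic:

#include "integral_constant.hpp"

constexpr auto a = ck::Number<6>{} + ck::Number<2>{}; // Number<8>
constexpr auto b = ck::Number<6>{} / ck::Number<2>{}; // Number<3>; operator/ static_asserts Y > 0
static_assert(a() == 8 && b() == 3, "values are available at compile time");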
composable_kernel/include/utility/math.hpp

@@ -3,6 +3,7 @@
 #include "config.hpp"
 #include "integral_constant.hpp"
+#include "vector_type.hpp"

 namespace ck {
 namespace math {
 ...
@@ -42,20 +43,16 @@ struct integer_divide_ceiler
 }
 };

-template <class T>
-__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
-{
-    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-    return (a + b - 1) / b;
-}
+template <class X, class Y>
+__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
+{
+    return (x + y - 1) / y;
+}

-template <class T>
-__host__ __device__ constexpr T integer_least_multiple(T a, T b)
-{
-    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-    return b * integer_divide_ceil(a, b);
-}
+template <class X, class Y>
+__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
+{
+    return y * integer_divide_ceil(x, y);
+}

 template <class T>
 ...
@@ -102,6 +99,72 @@ __host__ __device__ constexpr T lcm(T x, Ts... xs)
 return max(x, xs...);
 }

The same hunk appends inner_product_with_conversion:

template <class T>
struct inner_product_with_conversion
{
    static constexpr auto convert = type_convert<T>();

    __device__ T operator()(float a, float b) const { return convert(a) * convert(b); }

    __device__ T operator()(const vector_type<half, 2>::MemoryType& a,
                            const vector_type<half, 2>::MemoryType& b) const
    {
        const half* p_a_half = reinterpret_cast<const half*>(&a);
        const half* p_b_half = reinterpret_cast<const half*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 2; ++v)
        {
            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
        }
        return acc;
    }

    __device__ T operator()(const vector_type<half, 4>::MemoryType& a,
                            const vector_type<half, 4>::MemoryType& b) const
    {
        const half* p_a_half = reinterpret_cast<const half*>(&a);
        const half* p_b_half = reinterpret_cast<const half*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 4; ++v)
        {
            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
        }
        return acc;
    }

    __device__ T operator()(const vector_type<ushort, 2>::MemoryType& a,
                            const vector_type<ushort, 2>::MemoryType& b) const
    {
        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 2; ++v)
        {
            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
        }
        return acc;
    }

    __device__ T operator()(const vector_type<ushort, 4>::MemoryType& a,
                            const vector_type<ushort, 4>::MemoryType& b) const
    {
        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);

        T acc = 0;
        for(index_t v = 0; v < 4; ++v)
        {
            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
        }
        return acc;
    }
};

} // namespace math
} // namspace ck
...
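A quick illustration (not in the diff) of why the relaxed templates matter: mixed operand types now deduce cleanly, where the old single-T signature rejected them.

#include "math.hpp"

constexpr ck::index_t n       = 70; // uint32_t
constexpr int         per_blk = 16; // mixing these failed the old single-T deduction

constexpr auto n_blocks = ck::math::integer_divide_ceil(n, per_blk);    // (70 + 16 - 1) / 16 = 5
constexpr auto padded   = ck::math::integer_least_multiple(n, per_blk); // 16 * 5 = 80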
composable_kernel/include/utility/vector_type.hpp

The specializations drop their GetSize() helpers and gain Pack() helpers built on a DataType union; the generic fallback now exposes a MemoryType; the half/ushort specializations switch to the half2_t/half4_t/ushort2_t/ushort4_t types from config (the ushort2 union's scalar member is also corrected from half to ushort), and the ushort specializations end up wrapped in #if 0. The changed regions now read:

#ifndef CK_VECTOR_TYPE_HPP
#define CK_VECTOR_TYPE_HPP

#if 0
#include "hip/hip_fp16.h"
#else
#include "cuda_fp16.h"
#endif

#include "config.hpp"
#include "integral_constant.hpp"
...
@@ -10,7 +14,10 @@ namespace ck {
template <class T, index_t N>
struct vector_type
{
    typedef struct
    {
        T scalar[N];
    } MemoryType;
};

template <>
...
@@ -18,8 +25,6 @@ struct vector_type<float, 1>
{
    using MemoryType = float;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
    {
...
@@ -33,9 +38,7 @@ struct vector_type<float, 2>
{
    using MemoryType = float2_t;

    union DataType
    {
        MemoryType vector;
        float scalar[2];
...
@@ -48,6 +51,13 @@ struct vector_type<float, 2>
        *(reinterpret_cast<float*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(float s0, float s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
...
@@ -70,8 +80,6 @@ struct vector_type<half, 1>
{
    using MemoryType = half;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
    {
...
@@ -83,16 +91,14 @@ struct vector_type<half, 1>
template <>
struct vector_type<half, 2>
{
    using MemoryType = half2_t;

    union DataType
    {
        MemoryType vector;
        half scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
    {
...
@@ -100,17 +106,25 @@ struct vector_type<half, 2>
        *(reinterpret_cast<half*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(half s0, half s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<half, 4>
{
    using MemoryType = half4_t;

    union DataType
    {
        MemoryType vector;
        half scalar[4];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
...
@@ -118,15 +132,24 @@ struct vector_type<half, 4>
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<half*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        data.scalar[2] = s2;
        data.scalar[3] = s3;
        return data.vector;
    }
};

#if 0
template <>
struct vector_type<ushort, 1>
{
    using MemoryType = ushort;

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
    {
...
@@ -138,16 +161,14 @@ struct vector_type<ushort, 1>
template <>
struct vector_type<ushort, 2>
{
    using MemoryType = ushort2_t;

    union DataType
    {
        MemoryType vector;
        ushort scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
    {
...
@@ -155,17 +176,25 @@ struct vector_type<ushort, 2>
        *(reinterpret_cast<ushort*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        return data.vector;
    }
};

template <>
struct vector_type<ushort, 4>
{
    using MemoryType = ushort4_t;

    union DataType
    {
        MemoryType vector;
        ushort scalar[4];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
...
@@ -173,8 +202,20 @@ struct vector_type<ushort, 4>
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<ushort*>(&v) + I) = s;
    }

    __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
    {
        DataType data;
        data.scalar[0] = s0;
        data.scalar[1] = s1;
        data.scalar[2] = s2;
        data.scalar[3] = s3;
        return data.vector;
    }
};
#endif

} // namespace ck
#endif
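A usage sketch (not in the diff) of the new Pack helpers:

#include "vector_type.hpp"

using vec2 = ck::vector_type<float, 2>;

vec2::MemoryType v = vec2::Pack(1.0f, 2.0f);   // float2_t holding {1, 2}
vec2::SetScalar(v, 3.0f, ck::Number<0>{});     // overwrite lane 0 -> {3, 2}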
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
#include "implicitgemm_params.hpp"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t CPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
    constexpr auto wei_ck_desc   = make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<1, C>{});

    using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;

    constexpr index_t GemmMPerThreadSubC = CK_PARAM_GEMM_M_PER_THREAD_SUB_C;
    constexpr index_t GemmNPerThreadSubC = CK_PARAM_GEMM_N_PER_THREAD_SUB_C;
    constexpr index_t GemmMLevel0Cluster = CK_PARAM_GEMM_M_LEVEL0_CLUSTER;
    constexpr index_t GemmNLevel0Cluster = CK_PARAM_GEMM_N_LEVEL0_CLUSTER;
    constexpr index_t GemmMLevel1Cluster = CK_PARAM_GEMM_M_LEVEL1_CLUSTER;
    constexpr index_t GemmNLevel1Cluster = CK_PARAM_GEMM_N_LEVEL1_CLUSTER;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = GemmMPerThreadSubC;
    constexpr index_t GemmDataPerReadB   = GemmNPerThreadSubC;

    constexpr index_t GemmNRepeat = 2;
    constexpr index_t N1          = GemmNRepeat;
    constexpr index_t N2          = GemmNPerThreadSubC;

    constexpr index_t InBlockCopyClusterLengths_E  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopyClusterLengths_N1 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N1;
    constexpr index_t InBlockCopyClusterLengths_N2 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N2;

    constexpr index_t InBlockCopySubLengths_E  = CPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B  = BPerBlock / InBlockCopyClusterLengths_B;
    constexpr index_t InBlockCopySubLengths_N1 = N1 / InBlockCopyClusterLengths_N1;
    constexpr index_t InBlockCopySubLengths_N2 = N2 / InBlockCopyClusterLengths_N2;

    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<InBlockCopySubLengths_E,
                                                     InBlockCopySubLengths_N1,
                                                     InBlockCopySubLengths_B,
                                                     InBlockCopySubLengths_N2>;
    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<InBlockCopyClusterLengths_E,
                                                         InBlockCopyClusterLengths_N1,
                                                         InBlockCopyClusterLengths_B,
                                                         InBlockCopyClusterLengths_N2>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

    constexpr index_t InBlockCopySrcDataPerRead_B   = CK_PARAM_IN_BLOCK_COPY_SRC_DATA_PER_READ_B;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = CPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATE_PER_WRITE_K;

    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4_nchw_kc1x1_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_ck_desc), decltype(out_nkhw_desc),
            ConvStrides,
            static_cast<ImplicitGemmDirection>(CK_PARAM_PROBLEM_DIRECTION),
            BPerBlock, KPerBlock, CPerBlock, N1, N2,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E,
            WeiBlockCopyDstDataPerWrite_K>{};

    gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
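The wrapper only wires CK_PARAM_* macros into the gridwise template; the sub-length / cluster-length split it computes follows a simple rule: per-thread SubLengths times per-dimension ClusterLengths covers the block tile, and the cluster product normally equals the block size. A worked example with made-up tuning values (see the sketch below; none of these numbers come from the commit):

#include "common_header.hpp"

// Hypothetical tuning values, for illustration only.
constexpr ck::index_t CPerBlock = 8, BPerBlock = 16, N1 = 2, N2 = 4;
constexpr ck::index_t ClusterE = 8, ClusterB = 16, ClusterN1 = 1, ClusterN2 = 2;

constexpr ck::index_t SubE  = CPerBlock / ClusterE; // 1 element per thread along E
constexpr ck::index_t SubB  = BPerBlock / ClusterB; // 1
constexpr ck::index_t SubN1 = N1 / ClusterN1;       // 2
constexpr ck::index_t SubN2 = N2 / ClusterN2;       // 2

static_assert(ClusterE * ClusterB * ClusterN1 * ClusterN2 == 256,
              "cluster spans a 256-thread block (CK_PARAM_TUNABLE_BLOCK_SIZE)");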
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "float_types.h"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
    constexpr index_t Y  = CK_PARAM_PROBLEM_Y;
    constexpr index_t X  = CK_PARAM_PROBLEM_X;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
    constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params amd heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
    // In the WrW direction the filter is the output, while the output image is the input being
    // convolved with the (original) input image. This requires that the tensordescriptors be
    // swapped
    // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
    // input descriptor, the n and k dimension of the output descriptor
    // This change is necessary so that reduction dimensions are consistent with the requirement
    // of the wrw convolution when used in a fwd context
    constexpr auto tmp_in_nchw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto tmp_out_nkhw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    // wei and out are swapped in the solver
    constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});

    constexpr auto dir = ImplicitGemmDirection::BackwardWeight;

    // swap stride and dilation
    using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
    using ConvStrides   = Sequence<ConvDilationH, ConvDilationW>;
#else
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto dir = ImplicitGemmDirection::ForwardData;

    using ConvStrides   = Sequence<ConvStrideH, ConvStrideW>;
    using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2

    constexpr index_t GemmMPerThreadSubC = CK_PARAM_GEMM_M_PER_THREAD_SUB_C;
    constexpr index_t GemmNPerThreadSubC = CK_PARAM_GEMM_N_PER_THREAD_SUB_C;
    constexpr index_t GemmMLevel0Cluster = CK_PARAM_GEMM_M_LEVEL0_CLUSTER;
    constexpr index_t GemmNLevel0Cluster = CK_PARAM_GEMM_N_LEVEL0_CLUSTER;
    constexpr index_t GemmMLevel1Cluster = CK_PARAM_GEMM_M_LEVEL1_CLUSTER;
    constexpr index_t GemmNLevel1Cluster = CK_PARAM_GEMM_N_LEVEL1_CLUSTER;
    constexpr index_t GemmKPerThreadLoop = 1;

    constexpr index_t GemmNRepeat = CK_PARAM_GEMM_N_REPEAT;
    constexpr index_t N1          = GemmNRepeat;
    constexpr index_t N2          = GemmNPerThreadSubC;

    constexpr index_t InBlockCopyClusterLengths_E  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B  = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopyClusterLengths_N1 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N1;
    constexpr index_t InBlockCopyClusterLengths_N2 = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_N2;

    constexpr index_t InBlockCopySubLengths_E  = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B  = BPerBlock / InBlockCopyClusterLengths_B;
    constexpr index_t InBlockCopySubLengths_N1 = N1 / InBlockCopyClusterLengths_N1;
    constexpr index_t InBlockCopySubLengths_N2 = N2 / InBlockCopyClusterLengths_N2;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

#if MIOPEN_USE_FP32
    constexpr index_t GemmDataPerReadA = GemmMPerThreadSubC;
    constexpr index_t GemmDataPerReadB = GemmNPerThreadSubC;

    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<InBlockCopySubLengths_E,
                                                     InBlockCopySubLengths_N1,
                                                     InBlockCopySubLengths_B,
                                                     InBlockCopySubLengths_N2>;
    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<InBlockCopyClusterLengths_E,
                                                         InBlockCopyClusterLengths_N1,
                                                         InBlockCopyClusterLengths_B,
                                                         InBlockCopyClusterLengths_N2>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

    constexpr index_t InBlockCopySrcDataPerRead_B   = CK_PARAM_IN_BLOCK_COPY_SRC_DATA_PER_READ_B;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATE_PER_WRITE_K;
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    constexpr index_t GemmDataPerReadA = 1;
    constexpr index_t GemmDataPerReadB = 1;

    constexpr index_t EPACK = CK_PARAM_EPACK_LENGTH;

    using InBlockCopySubLengths_E_N1_B_N2_EPACK = Sequence<InBlockCopySubLengths_E,
                                                           InBlockCopySubLengths_N1,
                                                           InBlockCopySubLengths_B,
                                                           InBlockCopySubLengths_N2,
                                                           EPACK>;
    using InBlockCopyClusterLengths_E_N1_B_N2_EPACK = Sequence<InBlockCopyClusterLengths_E,
                                                               InBlockCopyClusterLengths_N1,
                                                               InBlockCopyClusterLengths_B,
                                                               InBlockCopyClusterLengths_N2,
                                                               1>;

    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = CK_PARAM_IN_BLOCK_COPY_DST_DATA_PER_WRITE_N2;

    // EPACK - E dimension is folded into 2 dimensions E and EPACK
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2, 4>; // [E, N1, N2, B, EPACK]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2, 4>; // [E, N1, N2, B, EPACK]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3, 4>; // [E, N1, B, N2, EPACK]

    using WeiBlockCopySubLengths_E_K_EPACK =
        Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPACK>;
    using WeiBlockCopyClusterLengths_E_K_EPACK =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPACK]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0, 2>; // [K, E, EPACK]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, K, EPACK]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATE_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#else
    static_assert(false, "wrong! Only kperblock could be 32/64/128 not supported");
#endif

#if MIOPEN_USE_FP32
    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, GemmNRepeat,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            dir>{};
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, GemmNRepeat, EPACK,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            InBlockCopySubLengths_E_N1_B_N2_EPACK, InBlockCopyClusterLengths_E_N1_B_N2_EPACK,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2,
            WeiBlockCopySubLengths_E_K_EPACK, WeiBlockCopyClusterLengths_E_K_EPACK,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            dir>{};
#else
    static_assert(false, "wrong! Only fp32, fp16 and bfp16 are supported.");
#endif

    gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
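The WrW branch above reuses the forward kernel by renaming tensors; a concrete reading of the descriptor permutations, with hypothetical sizes:

// Illustration only - sizes are made up; ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{})
// swaps the first two lengths of a 4-d descriptor.
constexpr int N = 64, C = 32, K = 128, Hi = 14, Wi = 14, Ho = 14, Wo = 14, Y = 3, X = 3;
// kernel's "input"  = original input  [N, C, Hi, Wi] -> [C, N, Hi, Wi]
// kernel's "weight" = original output [N, K, Ho, Wo] -> [K, N, Ho, Wo]
// kernel's "output" = original filter [K, C, Y,  X ] -> [C, K, Y,  X ]
// so the GEMM reduction runs over N (and the output spatial positions), which is the
// reduction a weight-gradient convolution needs; strides and dilations swap roles for
// the same reason.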
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer.hpp"
#include "float_types.h"
#include "implicitgemm_params.hpp"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params amd heuristic params
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_ck_desc   = make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<1, C>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    using ConvStrides = Sequence<ConvStrideH, ConvStrideW>;

    constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopySubLengths_E     = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B     = BPerBlock / InBlockCopyClusterLengths_B;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    using InBlockCopySubLengths_E_B     = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    constexpr index_t InBlockCopyDataPerAccess_B = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;

    constexpr index_t OutThreadCopyDataPerAccess_B = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;

    constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
    constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
    constexpr auto GemmMWaves   = KPerBlock / GemmMPerWave;
    constexpr auto GemmNWaves   = BPerBlock / GemmNPerWave;

    constexpr auto GemmDataPerReadA = 1;
    constexpr auto GemmDataPerReadB = 1;

    constexpr auto EnableXdlops = CK_ENABLE_XDLOPS == 1;

    constexpr auto gridwise_conv =
        GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT,
            decltype(in_nchw_desc), decltype(wei_ck_desc), decltype(out_nkhw_desc),
            ConvStrides,
            static_cast<ImplicitGemmDirection>(CK_PARAM_PROBLEM_DIRECTION),
            BPerBlock, KPerBlock, EPerBlock,
            GemmMPerWave, GemmNPerWave, GemmMWaves, GemmNWaves,
            GemmDataPerReadA, GemmDataPerReadB,
            EnableXdlops,
            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            OutThreadCopyDataPerAccess_B>{};

    gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
}
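The xdlops wrappers split the block tile across waves; a worked example of the wave decomposition with made-up tuning values (wave64 hardware assumed):

#include "common_header.hpp"

// Hypothetical tuning values, for illustration only.
constexpr ck::index_t KPerBlock    = 128, BPerBlock    = 64; // block tile (GEMM M x N)
constexpr ck::index_t GemmMPerWave = 64,  GemmNPerWave = 32; // per-wave tile

constexpr ck::index_t GemmMWaves = KPerBlock / GemmMPerWave; // 2 waves along M
constexpr ck::index_t GemmNWaves = BPerBlock / GemmNPerWave; // 2 waves along N

static_assert(GemmMWaves * GemmNWaves * 64 == 256,
              "4 wave64 wavefronts = 256 threads = CK_PARAM_TUNABLE_BLOCK_SIZE");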
composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp
0 → 100644

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "float_types.h"

extern "C" __global__
    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void
    gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer(
        const FLOAT* const __restrict__ p_in_global,
        const FLOAT* const __restrict__ p_wei_global,
        FLOAT* const __restrict__ p_out_global)
{
    using namespace ck;

    // read params: problem decription
    constexpr index_t N  = CK_PARAM_PROBLEM_N;
    constexpr index_t K  = CK_PARAM_PROBLEM_K;
    constexpr index_t C  = CK_PARAM_PROBLEM_C;
    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
    constexpr index_t Y  = CK_PARAM_PROBLEM_Y;
    constexpr index_t X  = CK_PARAM_PROBLEM_X;

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
    constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params amd heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
    // In the WrW direction the filter is the output, while the output image is the input being
    // convolved with the (original) input image. This requires that the tensordescriptors be
    // swapped
    // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
    // input descriptor, the n and k dimension of the output descriptor
    // This change is necessary so that reduction dimensions are consistent with the requirement
    // of the wrw convolution when used in a fwd context
    constexpr auto tmp_in_nchw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto tmp_out_nkhw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    // wei and out are swapped in the solver
    constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});

    constexpr auto dir = ImplicitGemmDirection::BackwardWeight;

    // swap stride and dilation
    using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
    using ConvStrides   = Sequence<ConvDilationH, ConvDilationW>;
#else
    // calculate dependent params amd heuristic params
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto dir = ImplicitGemmDirection::ForwardData;

    using ConvStrides   = Sequence<ConvStrideH, ConvStrideW>;
    using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2

    constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopySubLengths_E     = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B     = BPerBlock / InBlockCopyClusterLengths_B;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    constexpr index_t EPack = CK_PARAM_EPACK_LENGTH;

#if MIOPEN_USE_FP32
    using InBlockCopySubLengths_E_B     = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    using InBlockCopySubLengths_E_B =
        Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B, EPack>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B, 1>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]

    using WeiBlockCopySubLengths_E_K =
        Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPack>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, K, EPack]
#endif

    constexpr index_t InBlockCopyDataPerAccess_B    = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;
    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;
    constexpr index_t OutThreadCopyDataPerAccess_B  = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;

    constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
    constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
    constexpr auto GemmMWaves   = KPerBlock / GemmMPerWave;
    constexpr auto GemmNWaves   = BPerBlock / GemmNPerWave;

    constexpr index_t GemmDataPerReadA = 1;
    constexpr index_t GemmDataPerReadB = 1;

    constexpr auto gridwise_conv =
#if MIOPEN_USE_FP32
        GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, EPack,
            GemmMPerWave, GemmNPerWave, GemmMWaves, GemmNWaves,
            GemmDataPerReadA, GemmDataPerReadB,
            (CK_ENABLE_XDLOPS == 1),
            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K,
            OutThreadCopyDataPerAccess_B, dir>{};
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
        GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, FLOAT, FLOAT_ACCUM,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations,
            BPerBlock, KPerBlock, EPerBlock, EPack,
            GemmMPerWave, GemmNPerWave, GemmMWaves, GemmNWaves,
            GemmDataPerReadA, GemmDataPerReadB,
            (CK_ENABLE_XDLOPS == 1),
            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder,
            InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder,
            WeiBlockCopySrcDataPerRead_E,
,
WeiBlockCopyDstDataPerWrite_K
,
OutThreadCopyDataPerAccess_B
,
dir
>
{};
#else
static_assert
(
false
,
"wrong! Only fp32, fp16 and bfp16 are supported."
);
#endif
gridwise_conv
.
Run
(
p_in_global
,
p_wei_global
,
p_out_global
);
}
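The WrW branch above reuses the forward kernel by relabelling dimensions: ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{}) swaps the first two dimensions of a packed descriptor, and the stride/dilation sequences are exchanged as well. As a minimal standalone sketch (plain C++, not the ck descriptor API; the helper name and sizes below are made up for illustration), the length reorder amounts to:

// Minimal sketch of a "new2old" reorder of tensor lengths, e.g. Sequence<1, 0, 2, 3>
// turning an NCHW descriptor into CNHW. Not the ck API; names/sizes are illustrative.
#include <array>
#include <cstdio>

// new_lengths[d] = old_lengths[new2old[d]]
std::array<int, 4> reorder_given_new2old(const std::array<int, 4>& old_lengths,
                                         const std::array<int, 4>& new2old)
{
    std::array<int, 4> new_lengths{};
    for(int d = 0; d < 4; ++d)
        new_lengths[d] = old_lengths[new2old[d]];
    return new_lengths;
}

int main()
{
    // hypothetical sizes: N=8, C=64, Hi=4, Wi=4
    std::array<int, 4> in_nchw{8, 64, 4, 4};
    // swap the first two dimensions, as the WrW path does with Sequence<1, 0, 2, 3>
    std::array<int, 4> in_cnhw = reorder_given_new2old(in_nchw, {1, 0, 2, 3});
    std::printf("reordered lengths: %d %d %d %d\n",
                in_cnhw[0], in_cnhw[1], in_cnhw[2], in_cnhw[3]);
}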
driver/include/conv_common.hpp
@@ -32,6 +32,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe
     constexpr auto HO = HI + 1 - Y;
     constexpr auto WO = WI + 1 - X;
+    printf("H0=%d, W0=%d\n", HO, WO);
     return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
 }
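The hunk above computes the output size for the no-padding, unit-stride, unit-dilation case, HO = HI + 1 - Y. Below is a small sketch of the general output-size formula this is a special case of; the function name and sizes are illustrative only, not part of conv_common.hpp:

// General convolution output size; with pad = 0, stride = 1, dilation = 1 this
// reduces to hi + 1 - y, which is what the hunk above computes.
#include <cstdio>

int conv_out_size(int hi, int y, int pad, int stride, int dilation)
{
    return (hi + 2 * pad - dilation * (y - 1) - 1) / stride + 1;
}

int main()
{
    // hypothetical sizes: HI = 4, Y = 3 -> HO = 2
    std::printf("HO = %d\n", conv_out_size(4, 3, /*pad=*/0, /*stride=*/1, /*dilation=*/1));
}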
driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
 #pragma once
 #include <unistd.h>
-#define MIOPEN_USE_FP16 1
+#define MIOPEN_USE_FP16 0
 #define MIOPEN_USE_BFP16 0
-#define MIOPEN_USE_FP32 0
+#define MIOPEN_USE_FP32 1
 #define __HIP_PLATFORM_HCC__ 1
driver/include/device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp (new file, 0 → 100644)
#pragma once
#include <unistd.h>

#define MIOPEN_USE_FP16 0
#define MIOPEN_USE_BFP16 0
#define MIOPEN_USE_FP32 1
#define __HIP_PLATFORM_HCC__ 1

#include "float_types.h"
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
//#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp"

#define CK_ENABLE_XDLOPS 0

#define CK_PARAM_PROBLEM_DIRECTION 0
#define CK_PARAM_EPACK_LENGTH 1

#define CK_PARAM_TUNABLE_BLOCK_SIZE 64
#define CK_PARAM_TUNABLE_K_PER_BLOCK 32
#define CK_PARAM_TUNABLE_B_PER_BLOCK 64
#define CK_PARAM_TUNABLE_E_PER_BLOCK 8
#define CK_PARAM_DEPENDENT_GRID_SIZE 16
#define CK_PARAM_GEMM_M_PER_WAVE 32
#define CK_PARAM_GEMM_N_PER_WAVE 64
#define CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E 8
#define CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B 8
#define CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E 4
#define CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K 16
#define CK_PARAM_PROBLEM_CONV_DILATION_W 1
#define CK_PARAM_PROBLEM_CONV_DILATION_H 1
#define CK_PARAM_PROBLEM_CONV_STRIDE_H 1
#define CK_PARAM_PROBLEM_CONV_STRIDE_W 1
#define CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B 1
#define CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E 2
#define CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K 2
#define CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B 1

using namespace ck;

template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations>
void device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw(InDesc,
                                                        const Tensor<T>& in_nchw,
                                                        WeiDesc,
                                                        const Tensor<T>& wei_kcyx,
                                                        OutDesc,
                                                        Tensor<T>& out_nkhw,
                                                        ConvStrides,
                                                        ConvDilations,
                                                        index_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc_org  = InDesc{};
    constexpr auto wei_kcyx_desc_org = WeiDesc{};
    constexpr auto out_nkhw_desc_org = OutDesc{};

    constexpr index_t Hi = in_nchw_desc_org.GetLength(I2);
    constexpr index_t Wi = in_nchw_desc_org.GetLength(I3);

    constexpr index_t N  = out_nkhw_desc_org.GetLength(I0);
    constexpr index_t Ho = out_nkhw_desc_org.GetLength(I2);
    constexpr index_t Wo = out_nkhw_desc_org.GetLength(I3);

    constexpr index_t K = wei_kcyx_desc_org.GetLength(I0);
    constexpr index_t C = wei_kcyx_desc_org.GetLength(I1);
    constexpr index_t Y = wei_kcyx_desc_org.GetLength(I2);
    constexpr index_t X = wei_kcyx_desc_org.GetLength(I3);

    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;

    constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
    constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;

    // read params: tunable params
    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
    constexpr index_t BPerBlock = CK_PARAM_TUNABLE_B_PER_BLOCK;
    constexpr index_t KPerBlock = CK_PARAM_TUNABLE_K_PER_BLOCK;
    constexpr index_t EPerBlock = CK_PARAM_TUNABLE_E_PER_BLOCK;

    // read params: dependent params
    constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE;

    // calculate dependent params and heuristic params
#if CK_PARAM_PROBLEM_DIRECTION == 2
    // In the WrW direction the filter is the output, while the output image is the input being
    // convolved with the (original) input image. This requires that the tensor descriptors be
    // swapped.
    // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the
    // input descriptor, and the n and k dimension of the output descriptor.
    // This change is necessary so that reduction dimensions are consistent with the requirement
    // of the wrw convolution when used in a fwd context.
    printf("backward weight is executed\n");

    constexpr auto tmp_in_nchw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto tmp_wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto tmp_out_nkhw_desc =
        make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto in_nchw_desc = tmp_in_nchw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    // wei and out are swapped in the solver
    constexpr auto wei_kcyx_desc = tmp_out_nkhw_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});
    constexpr auto out_nkhw_desc = tmp_wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});

    constexpr auto dir = ImplicitGemmDirection::BackwardWeight;

    // swap stride and dilation
    // using ConvDilations = Sequence<ConvStrideH, ConvStrideW>;
    // using ConvStrides   = Sequence<ConvDilationH, ConvDilationW>;
#else
    printf("forward data is executed\n");

    // calculate dependent params and heuristic params
    constexpr auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, Hi, Wi>{});
    constexpr auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});

    constexpr auto dir = ImplicitGemmDirection::ForwardData;

    // using ConvStrides   = Sequence<ConvStrideH, ConvStrideW>;
    // using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
#endif // CK_PARAM_PROBLEM_DIRECTION == 2

    constexpr index_t InBlockCopyClusterLengths_E = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t InBlockCopyClusterLengths_B = CK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B;
    constexpr index_t InBlockCopySubLengths_E     = EPerBlock / InBlockCopyClusterLengths_E;
    constexpr index_t InBlockCopySubLengths_B     = BPerBlock / InBlockCopyClusterLengths_B;

    constexpr index_t WeiBlockCopyClusterLengths_E = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E;
    constexpr index_t WeiBlockCopyClusterLengths_K = CK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K;
    constexpr index_t WeiBlockCopySubLengths_E     = EPerBlock / WeiBlockCopyClusterLengths_E;
    constexpr index_t WeiBlockCopySubLengths_K     = KPerBlock / WeiBlockCopyClusterLengths_K;

    constexpr index_t EPack = CK_PARAM_EPACK_LENGTH;

#if MIOPEN_USE_FP32
    printf("fp32 is executed\n");

    using InBlockCopySubLengths_E_B = Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    using WeiBlockCopySubLengths_E_K = Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16
    using InBlockCopySubLengths_E_B =
        Sequence<InBlockCopySubLengths_E, InBlockCopySubLengths_B, EPack>;
    using InBlockCopyClusterLengths_E_B =
        Sequence<InBlockCopyClusterLengths_E, InBlockCopyClusterLengths_B, 1>;

    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, B, EPack]

    using WeiBlockCopySubLengths_E_K =
        Sequence<WeiBlockCopySubLengths_E, WeiBlockCopySubLengths_K, EPack>;
    using WeiBlockCopyClusterLengths_E_K =
        Sequence<WeiBlockCopyClusterLengths_E, WeiBlockCopyClusterLengths_K, 1>;

    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0, 2>; // [K, E, EPack]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1, 2>; // [E, K, EPack]
#endif

    constexpr index_t InBlockCopyDataPerAccess_B    = CK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B;
    constexpr index_t WeiBlockCopySrcDataPerRead_E  = CK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = CK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K;
    constexpr index_t OutThreadCopyDataPerAccess_B  = CK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B;

    constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
    constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
    constexpr auto GemmMWaves   = KPerBlock / GemmMPerWave;
    constexpr auto GemmNWaves   = BPerBlock / GemmNPerWave;

    constexpr index_t GemmDataPerReadA = 1;
    constexpr index_t GemmDataPerReadB = 1;

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    // #if MIOPEN_USE_FP16 == 1
    // // ES set to 4 as dot4 operator is supported on fp16 in MI100
    // constexpr index_t ES = 4;
    // #elif MIOPEN_USE_BFP16 == 1
    // // ES set to 2 as dot2 operator is supported on bfp16 in MI100
    // constexpr index_t ES = 2;
    // #else
    // // do nothing
    // #endif

    // constexpr index_t GridSize =
    //     ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv =
#if MIOPEN_USE_FP32 == 1
            GridwiseConvolutionImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw_lds_double_buffer<
                GridSize,
                BlockSize,
                FLOAT,
                FLOAT_ACCUM,
                decltype(in_nchw_desc),
                decltype(wei_kcyx_desc),
                decltype(out_nkhw_desc),
                ConvStrides,
                ConvDilations,
                BPerBlock,
                KPerBlock,
                EPerBlock,
                EPack,
                GemmMPerWave,
                GemmNPerWave,
                GemmMWaves,
                GemmNWaves,
                GemmDataPerReadA,
                GemmDataPerReadB,
                false,
                InBlockCopySubLengths_E_B,
                InBlockCopyClusterLengths_E_B,
                InBlockCopyThreadClusterArrangeOrder,
                InBlockCopySrcAccessOrder,
                InBlockCopyDstAccessOrder,
                InBlockCopyDataPerAccess_B,
                WeiBlockCopySubLengths_E_K,
                WeiBlockCopyClusterLengths_E_K,
                WeiBlockCopyThreadClusterArrangeOrder,
                WeiBlockCopySrcAccessOrder,
                WeiBlockCopyDstAccessOrder,
                WeiBlockCopySrcDataPerRead_E,
                WeiBlockCopyDstDataPerWrite_K,
                OutThreadCopyDataPerAccess_B,
                dir>{};
#elif MIOPEN_USE_FP16 == 1 || MIOPEN_USE_BFP16 == 1
            GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer<
                GridSize,
                BlockSize,
                FLOAT,
                FLOAT_ACCUM,
                decltype(in_nchw_desc),
                decltype(wei_kcyx_desc),
                decltype(out_nkhw_desc),
                ConvStrides,
                ConvDilations,
                BPerBlock,
                KPerBlock,
                EPerBlock,
                EPack,
                GemmMPerWave,
                GemmNPerWave,
                GemmMWaves,
                GemmNWaves,
                GemmDataPerReadA,
                GemmDataPerReadB,
                false,
                InBlockCopySubLengths_E_B,
                InBlockCopyClusterLengths_E_B,
                InBlockCopyThreadClusterArrangeOrder,
                InBlockCopySrcAccessOrder,
                InBlockCopyDstAccessOrder,
                InBlockCopyDataPerAccess_B,
                WeiBlockCopySubLengths_E_K,
                WeiBlockCopyClusterLengths_E_K,
                WeiBlockCopyThreadClusterArrangeOrder,
                WeiBlockCopySrcAccessOrder,
                WeiBlockCopyDstAccessOrder,
                WeiBlockCopySrcDataPerRead_E,
                WeiBlockCopyDstDataPerWrite_K,
                OutThreadCopyDataPerAccess_B,
                dir>{};
#endif

        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);

        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
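The timing loop above reports TFlop/s by dividing the flop count by 10^9 and then by the elapsed time in milliseconds. Below is a minimal sketch of that arithmetic, assuming the usual 2 * N * K * C * Ho * Wo * Y * X flop count for a direct convolution; the actual count comes from calculate_convolution_flops in conv_common.hpp, and the sizes and timing below are made up:

// Sketch of the TFlop/s number printed by the driver, under the assumption that
// the flop count is 2 * N * K * C * Ho * Wo * Y * X (one MAC counted as 2 flops).
#include <cstdio>

int main()
{
    // hypothetical problem size: N=32, C=64, K=32, Ho=2, Wo=2, Y=3, X=3
    double N = 32, C = 64, K = 32, Ho = 2, Wo = 2, Y = 3, X = 3;
    double flops = 2.0 * N * K * C * Ho * Wo * Y * X;

    double time_ms = 0.05; // made-up elapsed kernel time in milliseconds

    // flops / 1e9 is GFLOP; GFLOP divided by milliseconds is TFLOP/s,
    // which matches the units in the driver's printf
    double tflops = flops / (1000.0 * 1000.0 * 1000.0) / time_ms;

    std::printf("%f TFlop/s\n", tflops);
}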
driver/src/driver.cpp
@@ -8,11 +8,12 @@
 #include "device.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
+// #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
+// #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw.hpp"
 using namespace ck;
@@ -400,6 +401,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
     float ref_value = 0, result_value = 0;
     for(int i = 0; i < ref.mData.size(); ++i)
     {
+        std::cout << result.mData[i] << " ";
         error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
         float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
@@ -410,6 +412,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
         }
     }
+    std::cout << std::endl;
     std::cout << "error: " << error << std::endl;
     std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
 }
@@ -803,7 +806,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
 #elif 1
-    constexpr index_t N  = 8;
+    constexpr index_t N  = 32;
     constexpr index_t C  = 64;
     constexpr index_t HI = 4;
     constexpr index_t WI = 4;
@@ -830,8 +833,8 @@ int main(int argc, char* argv[])
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
     ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
-    using in_data_t  = half;
-    using out_data_t = half;
+    using in_data_t  = float;
+    using out_data_t = float;
     Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
     Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
     Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
@@ -850,7 +853,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 0
+#if 1
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
@@ -859,7 +862,7 @@ int main(int argc, char* argv[])
 #elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-#elif 1
+#elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #elif 0
@@ -883,8 +886,10 @@ int main(int argc, char* argv[])
         device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
 #elif 0
         device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
-#elif 1
+#elif 0
         device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
+#elif 1
+        device_convolution_implicit_gemm_v5_nchw_kcyx_nkhw
 #endif
         (in_nchw_desc,
          in_nchw,
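The check_error hunks above add per-element printing to a routine that accumulates two metrics: a running sum of absolute differences and the largest single difference between reference and result. A minimal standalone sketch of those metrics (plain std::vector in place of the driver's Tensor<T>, with made-up values):

// Sketch of the accumulated error and max_diff metrics used by check_error.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> ref{1.0f, 2.0f, 3.0f};
    std::vector<float> result{1.0f, 2.5f, 2.0f}; // made-up values

    double error    = 0;
    double max_diff = 0;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double diff = std::abs(double(ref[i]) - double(result[i]));
        error += diff;       // sum of absolute differences
        if(max_diff < diff)  // track the worst single element
            max_diff = diff;
    }
    std::printf("error: %f, max_diff: %f\n", error, max_diff);
}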