Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
20fa988f
Commit
20fa988f
authored
Feb 23, 2021
by
Chao Liu
Browse files
add fwd-v4r4-nhwc, change vector_type
parent
1c62b47b
Changes
6
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
1189 additions
and
159 deletions
+1189
-159
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+683
-0
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
...sable_kernel/include/utility/amd_buffer_addressing_v2.hpp
+45
-0
composable_kernel/include/utility/float_type.amd.hpp.in
composable_kernel/include/utility/float_type.amd.hpp.in
+188
-158
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+3
-0
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+256
-0
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+14
-1
No files found.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
0 → 100644
View file @
20fa988f
This diff is collapsed.
Click to expand it.
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
View file @
20fa988f
...
@@ -168,6 +168,51 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
...
@@ -168,6 +168,51 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
#endif
#endif
}
}
template
<
>
__device__
float8_t
amd_buffer_load_v2
<
float
,
8
>
(
const
float
*
p_src_wave
,
index_t
src_thread_data_offset
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
float
>
src_wave_buffer_resource
;
// wavewise base address (64 bit)
src_wave_buffer_resource
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
src_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
vector_type
<
float
,
8
>
vector
;
vector
.
Set
(
Number
<
4
>
{},
Number
<
0
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
vector
.
Set
(
Number
<
4
>
{},
Number
<
1
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
+
4
*
sizeof
(
float
),
0
,
0
);
return
vector
.
Get
(
Number
<
8
>
{},
Number
<
0
>
{});
#else
vector_type
<
float
,
8
>
vector
;
vector
.
Set
(
Number
<
4
>
{},
Number
<
0
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
,
0
,
0
);
vector
.
Set
(
Number
<
4
>
{},
Number
<
1
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
+
4
*
sizeof
(
float
),
0
,
0
);
return
src_thread_data_valid
?
vector
.
Get
(
Number
<
8
>
{},
Number
<
0
>
{})
:
float8_t
(
0
);
#endif
}
template
<
>
template
<
>
__device__
void
amd_buffer_store_v2
<
float
,
1
>
(
const
float
src_thread_data
,
__device__
void
amd_buffer_store_v2
<
float
,
1
>
(
const
float
src_thread_data
,
float
*
p_dst_wave
,
float
*
p_dst_wave
,
...
...
composable_kernel/include/utility/float_type.amd.hpp.in
View file @
20fa988f
...
@@ -4,19 +4,20 @@
...
@@ -4,19 +4,20 @@
namespace ck {
namespace ck {
// For some reason, HIP compiler need this definition to generate optimal ISA
// For some reason, HIP compiler need this definition to generate optimal ISA
// f
loat
// f
p32
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float8_t __attribute__((ext_vector_type(8)));
typedef float float16_t __attribute__((ext_vector_type(16)));
typedef float float16_t __attribute__((ext_vector_type(16)));
typedef float float32_t __attribute__((ext_vector_type(32)));
typedef float float32_t __attribute__((ext_vector_type(32)));
// f
loat
16
// f
p
16
typedef _Float16 half_t;
typedef _Float16 half_t;
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half8_t __attribute__((ext_vector_type(8)));
typedef _Float16 half8_t __attribute__((ext_vector_type(8)));
// bf
loat
16
// bf
p
16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
...
@@ -168,23 +169,17 @@ struct c_vec4_1_t
...
@@ -168,23 +169,17 @@ struct c_vec4_1_t
}
}
};
};
template <class T, index_t N>
template <typename T, index_t N>
struct vector_type
struct vector_type;
{
typedef struct
{
T scalar[N];
} MemoryType;
};
template <>
template <
typename T
>
struct vector_type<
float
, 1>
struct vector_type<
T
, 1>
{
{
using MemoryType =
float
;
using MemoryType =
T
;
float
data_;
T
data_;
__host__ __device__ constexpr vector_type() : data_{
0
} {}
__host__ __device__ constexpr vector_type() : data_{
T{0}
} {}
__host__ __device__ static constexpr index_t Size() { return 1; }
__host__ __device__ static constexpr index_t Size() { return 1; }
...
@@ -192,6 +187,22 @@ struct vector_type<float, 1>
...
@@ -192,6 +187,22 @@ struct vector_type<float, 1>
__host__ __device__ constexpr auto& Vector() { return data_; }
__host__ __device__ constexpr auto& Vector() { return data_; }
template <index_t I>
__host__ __device__ constexpr const auto& Get(Number<1>, Number<I>) const
{
static_assert(I == 0, "wrong!");
return data_;
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<1>, Number<I>)
{
static_assert(I == 0, "wrong!");
return data_;
}
template <index_t I>
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I>) const
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
{
...
@@ -209,31 +220,66 @@ struct vector_type<float, 1>
...
@@ -209,31 +220,66 @@ struct vector_type<float, 1>
}
}
};
};
template <>
template <
typename T
>
struct vector_type<
float
, 2>
struct vector_type<
T
, 2>
{
{
using MemoryType = float2_t;
using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2)));
using MemoryType = d2_t;
union
union
{
{
float2_t vector
_;
d2_t d2
_;
StaticallyIndexedArray<
floa
t, 2>
scalars
_;
StaticallyIndexedArray<
d1_
t, 2>
d1x2
_;
} data_;
} data_;
__host__ __device__ constexpr vector_type() : data_{
MemoryType
{0}} {}
__host__ __device__ constexpr vector_type() : data_{
d2_t
{0}} {}
__host__ __device__ static constexpr index_t Size() { return 2; }
__host__ __device__ static constexpr index_t Size() { return 2; }
__host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }
__host__ __device__ constexpr const auto& Vector() const { return data_.d2_; }
__host__ __device__ constexpr auto& Vector() { return data_.d2_; }
template <index_t I>
__host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d1x2_[i];
}
__host__ __device__ constexpr auto& Vector() { return data_.vector_; }
template <index_t I>
__host__ __device__ constexpr const auto& Get(Number<2>, Number<I>) const
{
static_assert(I == 0, "wrong!");
return data_.d2_;
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d1x2_(i);
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<2>, Number<I>)
{
static_assert(I == 0, "wrong!");
return data_.d2_;
}
template <index_t I>
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I>) const
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
{
static_assert(I >= 0 && I < 2, "wrong!");
static_assert(I >= 0 && I < 2, "wrong!");
return data_.
scalars
_[Number<I>{}];
return data_.
d1x2
_[Number<I>{}];
}
}
template <index_t I>
template <index_t I>
...
@@ -241,219 +287,203 @@ struct vector_type<float, 2>
...
@@ -241,219 +287,203 @@ struct vector_type<float, 2>
{
{
static_assert(I >= 0 && I < 2, "wrong!");
static_assert(I >= 0 && I < 2, "wrong!");
return data_.
scalars
_(Number<I>{});
return data_.
d1x2
_(Number<I>{});
}
}
};
};
template <>
template <
typename T
>
struct vector_type<
float
, 4>
struct vector_type<
T
, 4>
{
{
using MemoryType = float4_t;
using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2)));
typedef T d4_t __attribute__((ext_vector_type(4)));
using MemoryType = d4_t;
union
union
{
{
float4_t vector_;
d4_t d4_;
StaticallyIndexedArray<float, 4> scalars_;
StaticallyIndexedArray<d1_t, 4> d1x4_;
StaticallyIndexedArray<d2_t, 2> d2x2_;
} data_;
} data_;
__host__ __device__ constexpr vector_type() : data_{
MemoryType
{0}} {}
__host__ __device__ constexpr vector_type() : data_{
d4_t
{0}} {}
__host__ __device__ static constexpr index_t Size() { return 4; }
__host__ __device__ static constexpr index_t Size() { return 4; }
__host__ __device__ constexpr const auto& Vector() const { return data_.
vector
_; }
__host__ __device__ constexpr const auto& Vector() const { return data_.
d4
_; }
__host__ __device__ constexpr auto& Vector() { return data_.
vector
_; }
__host__ __device__ constexpr auto& Vector() { return data_.
d4
_; }
template <index_t I>
template <index_t I>
__host__ __device__ constexpr const auto&
operator[](Number<I>
) const
__host__ __device__ constexpr const auto&
Get(Number<1>, Number<I> i
) const
{
{
static_assert(I >= 0 && I < 4, "wrong!");
static_assert(I >= 0 && I < 4, "wrong!");
return data_.
scalars_[Number<I>{}
];
return data_.
d1x4_[i
];
}
}
template <index_t I>
template <index_t I>
__host__ __device__ constexpr
auto& operator()(Number<I>)
__host__ __device__ constexpr
const auto& Get(Number<2>, Number<I> i) const
{
{
static_assert(I >= 0 && I <
4
, "wrong!");
static_assert(I >= 0 && I <
2
, "wrong!");
return data_.
scalars_(Number<I>{})
;
return data_.
d2x2_[i]
;
}
}
};
template <>
struct vector_type<half_t, 1>
{
using MemoryType = half_t;
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, half_t s
, Number<I>)
__host__ __device__
constexpr const auto& Get(Number<4>
, Number<I>)
const
{
{
static_assert(I < 1, "wrong");
static_assert(I == 0, "wrong!");
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
};
template <>
struct vector_type<half_t, 2>
{
using MemoryType = half2_t;
union DataType
return data_.d4_;
{
}
MemoryType vector;
half_t scalar[2];
};
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, half_t s
, Number<I>)
__host__ __device__
constexpr auto& Set(Number<1>
, Number<I>
i
)
{
{
static_assert(I < 2, "wrong");
static_assert(I >= 0 && I < 4, "wrong!");
*(reinterpret_cast<half_t*>(&v) + I) = s;
return data_.d1x4_(i);
}
}
__host__ __device__ static MemoryType Pack(half_t s0, half_t s1)
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
{
{
DataType data;
static_assert(I >= 0 && I < 3, "wrong!");
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
return data_.d2x2_(i);
struct vector_type<half_t, 4>
}
{
using MemoryType = half4_t;
union DataType
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<4>, Number<I>)
{
{
MemoryType vector;
static_assert(I == 0, "wrong!");
half_t scalar[4];
};
return data_.d4_;
}
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, half_t s,
Number<I>)
__host__ __device__
constexpr const auto& operator[](
Number<I>)
const
{
{
static_assert(I < 4, "wrong");
static_assert(I >= 0 && I < 4, "wrong!");
*(reinterpret_cast<half_t*>(&v) + I) = s;
return data_.d1x4_[Number<I>{}];
}
}
__host__ __device__ static MemoryType Pack(half_t s0, half_t s1, half_t s2, half_t s3)
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I>)
{
{
DataType data;
static_assert(I >= 0 && I < 4, "wrong!");
data.scalar[0] = s0;
data.scalar[1] = s1;
return data_.d1x4_(Number<I>{});
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
}
};
};
template <>
template <
typename T
>
struct vector_type<
half_t
, 8>
struct vector_type<
T
, 8>
{
{
using MemoryType = half8_t;
using d1_t = T;
typedef T d2_t __attribute__((ext_vector_type(2)));
typedef T d4_t __attribute__((ext_vector_type(4)));
typedef T d8_t __attribute__((ext_vector_type(8)));
using MemoryType = d8_t;
union
DataType
union
{
{
MemoryType vector;
d8_t d8_;
half_t scalar[8];
StaticallyIndexedArray<d1_t, 8> d1x8_;
};
StaticallyIndexedArray<d2_t, 4> d2x4_;
StaticallyIndexedArray<d4_t, 2> d4x2_;
} data_;
__host__ __device__ constexpr vector_type() : data_{d8_t{0}} {}
__host__ __device__ static constexpr index_t Size() { return 8; }
__host__ __device__ constexpr const auto& Vector() const { return data_.d8_; }
__host__ __device__ constexpr auto& Vector() { return data_.d8_; }
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, half_t s
, Number<I>
)
__host__ __device__
constexpr const auto& Get(Number<1>
, Number<I>
i) const
{
{
static_assert(I < 8, "wrong");
static_assert(I >= 0 && I < 8, "wrong!");
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
};
template <>
return data_.d1x8_[i];
struct vector_type<ushort, 1>
}
{
using MemoryType = ushort;
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, ushort s
, Number<I>
)
__host__ __device__
constexpr const auto& Get(Number<2>
, Number<I>
i) const
{
{
static_assert(I < 1, "wrong");
static_assert(I >= 0 && I < 4, "wrong!");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
};
template <>
return data_.d2x4_[i];
struct vector_type<ushort, 2>
}
{
using MemoryType = ushort2_t;
union DataType
template <index_t I>
__host__ __device__ constexpr const auto& Get(Number<4>, Number<I> i) const
{
{
MemoryType vector;
static_assert(I >= 0 && I < 2, "wrong!");
ushort scalar[2];
};
return data_.d4x2_[i];
}
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, ushort s
, Number<I>)
__host__ __device__
constexpr const auto& Get(Number<8>
, Number<I>)
const
{
{
static_assert(I < 2, "wrong");
static_assert(I == 0, "wrong!");
*(reinterpret_cast<ushort*>(&v) + I) = s;
return data_.d8_;
}
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
{
{
DataType data;
static_assert(I >= 0 && I < 8, "wrong!");
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
return data_.d1x8_(i);
struct vector_type<ushort, 4>
}
{
using MemoryType = ushort4_t;
union DataType
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
{
{
MemoryType vector;
static_assert(I >= 0 && I < 4, "wrong!");
ushort scalar[4];
};
return data_.d2x4_(i);
}
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, ushort s
, Number<I>)
__host__ __device__
constexpr auto& Set(Number<4>
, Number<I>
i
)
{
{
static_assert(I < 4, "wrong");
static_assert(I >= 0 && I < 2, "wrong!");
*(reinterpret_cast<ushort*>(&v) + I) = s;
return data_.d4x2_(i);
}
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<8>, Number<I> i)
{
{
DataType data;
static_assert(I == 0, "wrong!");
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
template <>
return data_.d8_;
struct vector_type<ushort, 8>
}
{
using MemoryType = ushort8_t;
union DataType
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
{
MemoryType vector;
static_assert(I >= 0 && I < 8, "wrong!");
ushort scalar[8];
};
return data_.d1x8_[Number<I>{}];
}
template <index_t I>
template <index_t I>
__host__ __device__
static void SetScalar(MemoryType& v, ushort s,
Number<I>)
__host__ __device__
constexpr auto& operator()(
Number<I>)
{
{
static_assert(I < 8, "wrong");
static_assert(I >= 0 && I < 8, "wrong!");
*(reinterpret_cast<ushort*>(&v) + I) = s;
return data_.d1x8_(Number<I>{});
}
}
};
};
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
20fa988f
...
@@ -23,6 +23,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
...
@@ -23,6 +23,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
InRightPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
ck
::
index_t
nrepeat
)
{
{
std
::
cout
<<
"device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"
<<
std
::
endl
;
using
namespace
ck
;
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
0 → 100644
View file @
20fa988f
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
ConvStrides
,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
InLeftPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
{
std
::
cout
<<
"device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk"
<<
std
::
endl
;
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
N
=
OutDesc
::
GetLengths
()[
I0
];
constexpr
auto
K
=
OutDesc
::
GetLengths
()[
I1
];
constexpr
auto
C
=
WeiDesc
::
GetLengths
()[
I1
];
constexpr
auto
Hi
=
InDesc
::
GetLengths
()[
I2
];
constexpr
auto
Wi
=
InDesc
::
GetLengths
()[
I3
];
constexpr
auto
Ho
=
OutDesc
::
GetLengths
()[
I2
];
constexpr
auto
Wo
=
OutDesc
::
GetLengths
()[
I3
];
constexpr
auto
Y
=
WeiDesc
::
GetLengths
()[
I2
];
constexpr
auto
X
=
WeiDesc
::
GetLengths
()[
I3
];
#if 1
// run-time variables
constexpr
auto
in_n_hi_wi_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_multi_index
(
N
,
Hi
,
Wi
,
C
));
constexpr
auto
wei_k_y_x_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_multi_index
(
K
,
Y
,
X
,
C
));
constexpr
auto
out_n_ho_wo_k_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_multi_index
(
N
,
Ho
,
Wo
,
K
));
const
auto
conv_strides
=
to_multi_index
(
ConvStrides
{});
const
auto
conv_dilations
=
to_multi_index
(
ConvDilations
{});
const
auto
in_left_pads
=
to_multi_index
(
InLeftPads
{});
const
auto
in_right_pads
=
to_multi_index
(
InRightPads
{});
#else
// compile-time variables
constexpr
auto
in_n_hi_wi_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
Hi
,
Wi
,
C
));
constexpr
auto
wei_k_y_x_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
K
,
Y
,
X
,
C
));
constexpr
auto
out_n_ho_wo_k_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
Ho
,
Wo
,
K
));
const
auto
conv_strides
=
sequence_to_tuple_of_number
(
ConvStrides
{});
const
auto
conv_dilations
=
sequence_to_tuple_of_number
(
ConvDilations
{});
const
auto
in_left_pads
=
sequence_to_tuple_of_number
(
InLeftPads
{});
const
auto
in_right_pads
=
sequence_to_tuple_of_number
(
InRightPads
{});
#endif
Tensor
<
float
>
in_nhwc
(
make_HostTensorDescriptor
(
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
Hi
,
Wi
,
C
>
{})));
Tensor
<
float
>
wei_kyxc
(
make_HostTensorDescriptor
(
make_native_tensor_descriptor_packed
(
Sequence
<
K
,
Y
,
X
,
C
>
{})));
Tensor
<
float
>
out_nhwk
(
make_HostTensorDescriptor
(
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
Ho
,
Wo
,
K
>
{})));
auto
f_nchw2nhwc
=
[
&
](
auto
n
,
auto
hi
,
auto
wi
,
auto
c
)
{
in_nhwc
(
n
,
hi
,
wi
,
c
)
=
in_nchw
(
n
,
c
,
hi
,
wi
);
};
auto
f_kcyx2kyxc
=
[
&
](
auto
k
,
auto
y
,
auto
x
,
auto
c
)
{
wei_kyxc
(
k
,
y
,
x
,
c
)
=
wei_kcyx
(
k
,
c
,
y
,
x
);
};
auto
f_nkhw2nhwk
=
[
&
](
auto
n
,
auto
ho
,
auto
wo
,
auto
k
)
{
out_nhwk
(
n
,
ho
,
wo
,
k
)
=
out_nkhw
(
n
,
k
,
ho
,
wo
);
};
make_ParallelTensorFunctor
(
f_nchw2nhwc
,
N
,
Hi
,
Wi
,
C
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
f_kcyx2kyxc
,
K
,
Y
,
X
,
C
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
f_nkhw2nhwk
,
N
,
Ho
,
Wo
,
K
)(
std
::
thread
::
hardware_concurrency
());
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_nhwc_device_buf
(
data_sz
*
in_nhwc
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_kyxc_device_buf
(
data_sz
*
wei_kyxc
.
mDesc
.
GetElementSpace
());
DeviceMem
out_nhwk_device_buf
(
data_sz
*
out_nhwk
.
mDesc
.
GetElementSpace
());
in_nhwc_device_buf
.
ToDevice
(
in_nhwc
.
mData
.
data
());
wei_kyxc_device_buf
.
ToDevice
(
wei_kyxc
.
mData
.
data
());
out_nhwk_device_buf
.
ToDevice
(
out_nhwk
.
mData
.
data
());
#if 0
// cdata = 64, BlockSize = 128, 32x256x8
constexpr index_t BlockSize = 128;
constexpr index_t GemmMPerBlock = 32;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 16;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<2, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 32>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<8, 2>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif
0
// cdata = 64, BlockSize = 256, 128x128x8
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GemmMPerBlock
=
128
;
constexpr
index_t
GemmNPerBlock
=
128
;
constexpr
index_t
GemmKPerBlock
=
8
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmMLevel0Cluster
=
2
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
8
;
constexpr
index_t
GemmNLevel1Cluster
=
8
;
using
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
=
Sequence
<
4
,
1
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
=
Sequence
<
2
,
128
>
;
constexpr
index_t
GemmABlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmABlockTransferDstScalarPerVector_GemmM
=
1
;
using
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
=
Sequence
<
4
,
1
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
=
Sequence
<
2
,
128
>
;
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
#elif 1
// cdata = 64, BlockSize = 256, 128x128x16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
GemmMPerBlock
=
128
;
constexpr
index_t
GemmNPerBlock
=
128
;
constexpr
index_t
GemmKPerBlock
=
16
;
constexpr
index_t
GemmMPerThread
=
4
;
constexpr
index_t
GemmNPerThread
=
4
;
constexpr
index_t
GemmKPerThread
=
1
;
constexpr
index_t
GemmMLevel0Cluster
=
2
;
constexpr
index_t
GemmNLevel0Cluster
=
2
;
constexpr
index_t
GemmMLevel1Cluster
=
8
;
constexpr
index_t
GemmNLevel1Cluster
=
8
;
using
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
=
Sequence
<
4
,
2
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
=
Sequence
<
4
,
64
>
;
constexpr
index_t
GemmABlockTransferSrcScalarPerVector_GemmK
=
4
;
constexpr
index_t
GemmABlockTransferDstScalarPerVector_GemmM
=
2
;
using
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
=
Sequence
<
8
,
1
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
=
Sequence
<
2
,
128
>
;
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmK
=
8
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmM1
=
4
;
#endif
constexpr
auto
conv_driver
=
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_pad
#elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_no_pad
#elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk_1x1
#endif
<
BlockSize
,
TDevice
,
TDevice
,
GemmMPerBlock
,
GemmNPerBlock
,
GemmKPerBlock
,
GemmMPerThread
,
GemmNPerThread
,
GemmKPerThread
,
GemmMLevel0Cluster
,
GemmNLevel0Cluster
,
GemmMLevel1Cluster
,
GemmNLevel1Cluster
,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
,
GemmABlockTransferSrcScalarPerVector_GemmK
,
GemmABlockTransferDstScalarPerVector_GemmM
,
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
,
GemmBBlockTransferSrcScalarPerVector_GemmK
,
GemmBBlockTransferDstScalarPerVector_GemmN
,
GemmCThreadTransferDstScalarPerVector_GemmM1
>
{};
conv_driver
.
Run
(
wei_k_y_x_c_desc
,
in_n_hi_wi_c_desc
,
out_n_ho_wo_k_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
static_cast
<
TDevice
*>
(
wei_kyxc_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TDevice
*>
(
in_nhwc_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TDevice
*>
(
out_nhwk_device_buf
.
GetDeviceBuffer
()));
out_nhwk_device_buf
.
FromDevice
(
out_nhwk
.
mData
.
data
());
auto
f_nhwk2nkhw
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
out_nkhw
(
n
,
k
,
ho
,
wo
)
=
out_nhwk
(
n
,
ho
,
wo
,
k
);
};
make_ParallelTensorFunctor
(
f_nhwk2nkhw
,
N
,
K
,
Ho
,
Wo
)(
std
::
thread
::
hardware_concurrency
());
}
driver/src/conv_driver.cpp
View file @
20fa988f
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
...
@@ -615,7 +616,7 @@ int main(int argc, char* argv[])
...
@@ -615,7 +616,7 @@ int main(int argc, char* argv[])
LeftPads
{},
LeftPads
{},
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif
1
#elif
0
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
...
@@ -627,6 +628,18 @@ int main(int argc, char* argv[])
...
@@ -627,6 +628,18 @@ int main(int argc, char* argv[])
LeftPads
{},
LeftPads
{},
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif 1
device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{},
nrepeat
);
#endif
#endif
if
(
do_verification
)
if
(
do_verification
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment