Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
5e6cca6f
Commit
5e6cca6f
authored
Apr 26, 2022
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into cpu_avx2
parents
afc7d431
3956085d
Changes
62
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1159 additions
and
447 deletions
+1159
-447
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+15
-0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+17
-0
include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
...k/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+1
-0
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
...tion/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+6
-3
include/ck/utility/common_header.hpp
include/ck/utility/common_header.hpp
+2
-0
include/ck/utility/data_type.hpp
include/ck/utility/data_type.hpp
+0
-71
include/ck/utility/dynamic_buffer.hpp
include/ck/utility/dynamic_buffer.hpp
+8
-5
include/ck/utility/generic_memory_space_atomic_add.hpp
include/ck/utility/generic_memory_space_atomic_add.hpp
+44
-0
include/ck/utility/math_v2.hpp
include/ck/utility/math_v2.hpp
+53
-3
include/ck/utility/reduction_common.hpp
include/ck/utility/reduction_common.hpp
+2
-2
include/ck/utility/transpose_vectors.hpp
include/ck/utility/transpose_vectors.hpp
+82
-1
library/CMakeLists.txt
library/CMakeLists.txt
+2
-1
library/include/ck/library/host_tensor/host_reduce_util.hpp
library/include/ck/library/host_tensor/host_reduce_util.hpp
+7
-30
library/include/ck/library/host_tensor/host_reduction.hpp
library/include/ck/library/host_tensor/host_reduction.hpp
+13
-12
library/include/ck/library/utility/conv_fwd_util.hpp
library/include/ck/library/utility/conv_fwd_util.hpp
+323
-306
library/include/ck/library/utility/fill.hpp
library/include/ck/library/utility/fill.hpp
+81
-0
library/include/ck/library/utility/op_instance_engine.hpp
library/include/ck/library/utility/op_instance_engine.hpp
+231
-0
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
...vice_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
+13
-13
library/src/utility/CMakeLists.txt
library/src/utility/CMakeLists.txt
+21
-0
library/src/utility/conv_fwd_util.cpp
library/src/utility/conv_fwd_util.cpp
+238
-0
No files found.
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
View file @
5e6cca6f
...
...
@@ -45,6 +45,7 @@ __global__ void
const
CElementwiseOperation
c_element_op
,
const
Block2CTileMap
block_2_ctile_map
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
GridwiseGemm
::
template
Run
<
HasMainK0BlockLoop
>(
...
...
@@ -61,6 +62,20 @@ __global__ void
b_element_op
,
c_element_op
,
block_2_ctile_map
);
#else
ignore
=
p_a_grid
;
ignore
=
p_b_grid
;
ignore
=
p_c_grid
;
ignore
=
p_c0_grid
;
ignore
=
a_grid_desc_k0_m_k1
;
ignore
=
b_grid_desc_k0_n_k1
;
ignore
=
c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
;
ignore
=
c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
;
ignore
=
a_element_op
;
ignore
=
b_element_op
;
ignore
=
c_element_op
;
ignore
=
block_2_ctile_map
;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template
<
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
View file @
5e6cca6f
...
...
@@ -49,6 +49,7 @@ __global__ void
const
CElementwiseOperation
c_element_op
,
const
Block2CTileMap
block_2_ctile_map
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
GridwiseGemm
::
template
Run
<
HasMainK0BlockLoop
>(
...
...
@@ -67,6 +68,22 @@ __global__ void
b_element_op
,
c_element_op
,
block_2_ctile_map
);
#else
ignore
=
p_a_grid
;
ignore
=
p_b_grid
;
ignore
=
p_c_grid
;
ignore
=
p_c0_grid
;
ignore
=
p_c1_grid
;
ignore
=
a_grid_desc_k0_m_k1
;
ignore
=
b_grid_desc_k0_n_k1
;
ignore
=
c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
;
ignore
=
c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
;
ignore
=
c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl
;
ignore
=
a_element_op
;
ignore
=
b_element_op
;
ignore
=
c_element_op
;
ignore
=
block_2_ctile_map
;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template
<
...
...
include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
View file @
5e6cca6f
...
...
@@ -36,6 +36,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
DataType
value
)
{
using
PassThroughOp
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
DataType
,
DataType
>
;
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
View file @
5e6cca6f
...
...
@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
// TODO make this logic more generic for more sub-dword datatype
if
constexpr
(
SrcVectorDim
!=
DstVectorDim
&&
is_same
<
half_t
,
remove_cvref_t
<
SrcData
>>::
value
&&
is_same
<
half_t
,
remove_cvref_t
<
DstData
>>::
value
&&
SrcScalarPerVector
%
2
==
0
&&
DstScalarPerVector
%
2
==
0
)
((
is_same
<
half_t
,
remove_cvref_t
<
SrcData
>>::
value
&&
is_same
<
half_t
,
remove_cvref_t
<
DstData
>>::
value
&&
SrcScalarPerVector
%
2
==
0
&&
DstScalarPerVector
%
2
==
0
)
||
(
is_same
<
int8_t
,
remove_cvref_t
<
SrcData
>>::
value
&&
is_same
<
int8_t
,
remove_cvref_t
<
DstData
>>::
value
&&
SrcScalarPerVector
%
4
==
0
&&
DstScalarPerVector
%
4
==
0
)))
{
// each transpose does
// DstScalarPerVector # of src vectors in src_thread_scratch_
...
...
include/ck/utility/common_header.hpp
View file @
5e6cca6f
...
...
@@ -13,6 +13,7 @@
#include "functional3.hpp"
#include "functional4.hpp"
#include "enable_if.hpp"
#include "ignore.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
...
...
@@ -30,6 +31,7 @@
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
#include "get_id.hpp"
#include "synchronization.hpp"
#include "amd_address_space.hpp"
...
...
include/ck/utility/data_type.hpp
View file @
5e6cca6f
...
...
@@ -992,77 +992,6 @@ inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
return
uint16_t
(
u
.
int32
>>
16
);
}
// TODO: deprecate this
template
<
typename
T
>
struct
inner_product_with_conversion
{
template
<
typename
X
,
index_t
N
>
__device__
T
operator
()(
typename
vector_type
<
X
,
N
>::
type
a
,
typename
vector_type
<
X
,
N
>::
type
b
)
const
{
const
vector_type
<
X
,
N
>
a_vector
{
a
};
const
vector_type
<
X
,
N
>
b_vector
{
b
};
T
acc
=
0
;
static_for
<
0
,
N
,
1
>
{}([
&
](
auto
i
)
{
acc
+=
type_convert
<
T
>
(
a_vector
.
Scalars
()[
i
])
*
type_convert
<
T
>
(
b_vector
.
Scalars
()[
i
]);
});
return
acc
;
}
__device__
T
operator
()(
float_t
a
,
float_t
b
)
const
{
return
type_convert
<
T
>
(
a
)
*
type_convert
<
T
>
(
b
);
}
__device__
T
operator
()(
int8x4_t
a
,
int8x4_t
b
)
const
{
const
vector_type
<
int8_t
,
4
>
a_vector
{
a
};
const
vector_type
<
int8_t
,
4
>
b_vector
{
b
};
T
acc
=
0
;
static_for
<
0
,
4
,
1
>
{}([
&
](
auto
i
)
{
acc
+=
type_convert
<
T
>
(
a_vector
.
AsType
<
int8_t
>
()[
i
])
*
type_convert
<
T
>
(
b_vector
.
AsType
<
int8_t
>
()[
i
]);
});
return
acc
;
}
__device__
T
operator
()(
int8x8_t
a
,
int8x8_t
b
)
const
{
const
vector_type
<
int8_t
,
8
>
a_vector
{
a
};
const
vector_type
<
int8_t
,
8
>
b_vector
{
b
};
T
acc
=
0
;
static_for
<
0
,
8
,
1
>
{}([
&
](
auto
i
)
{
acc
+=
type_convert
<
T
>
(
a_vector
.
AsType
<
int8_t
>
()[
i
])
*
type_convert
<
T
>
(
b_vector
.
AsType
<
int8_t
>
()[
i
]);
});
return
acc
;
}
__device__
T
operator
()(
int8x16_t
a
,
int8x16_t
b
)
const
{
const
vector_type
<
int8_t
,
16
>
a_vector
{
a
};
const
vector_type
<
int8_t
,
16
>
b_vector
{
b
};
T
acc
=
0
;
static_for
<
0
,
16
,
1
>
{}([
&
](
auto
i
)
{
acc
+=
type_convert
<
T
>
(
a_vector
.
AsType
<
int8_t
>
()[
i
])
*
type_convert
<
T
>
(
b_vector
.
AsType
<
int8_t
>
()[
i
]);
});
return
acc
;
}
};
template
<
typename
T
>
struct
NumericLimits
{
...
...
include/ck/utility/dynamic_buffer.hpp
View file @
5e6cca6f
#pragma once
#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
#include "config.hpp"
#include "enable_if.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
namespace
ck
{
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
template
<
AddressSpaceEnum
BufferAddressSpace
,
typename
T
,
typename
ElementSpaceSize
,
...
...
@@ -316,9 +321,7 @@ struct DynamicBuffer
{
if
(
is_valid_element
)
{
// FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
// calling it
atomicAdd
(
c_style_pointer_cast
<
X
*>
(
&
p_data_
[
i
]),
x
);
atomic_add
<
X
>
(
c_style_pointer_cast
<
X
*>
(
&
p_data_
[
i
]),
x
);
}
}
}
...
...
include/ck/utility/generic_memory_space_atomic_add.hpp
0 → 100644
View file @
5e6cca6f
#pragma once
#include "data_type.hpp"
namespace
ck
{
template
<
typename
X
>
__device__
X
atomic_add
(
X
*
p_dst
,
const
X
&
x
);
template
<
>
__device__
int32_t
atomic_add
<
int32_t
>
(
int32_t
*
p_dst
,
const
int32_t
&
x
)
{
return
atomicAdd
(
p_dst
,
x
);
}
template
<
>
__device__
uint32_t
atomic_add
<
uint32_t
>
(
uint32_t
*
p_dst
,
const
uint32_t
&
x
)
{
return
atomicAdd
(
p_dst
,
x
);
}
template
<
>
__device__
float
atomic_add
<
float
>
(
float
*
p_dst
,
const
float
&
x
)
{
return
atomicAdd
(
p_dst
,
x
);
}
template
<
>
__device__
float2_t
atomic_add
<
float2_t
>
(
float2_t
*
p_dst
,
const
float2_t
&
x
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
const
vector_type
<
float
,
2
>
vx
{
x
};
vector_type
<
float
,
2
>
vy
{
0
};
vy
.
template
AsType
<
float
>()(
I0
)
=
atomicAdd
(
c_style_pointer_cast
<
float
*>
(
p_dst
),
vx
.
template
AsType
<
float
>()[
I0
]);
vy
.
template
AsType
<
float
>()(
I1
)
=
atomicAdd
(
c_style_pointer_cast
<
float
*>
(
p_dst
)
+
1
,
vx
.
template
AsType
<
float
>()[
I1
]);
return
vy
.
template
AsType
<
float2_t
>()[
I0
];
}
}
// namespace ck
include/ck/utility/math_v2.hpp
View file @
5e6cca6f
#ifndef CK_MATH_V2_HPP
#define CK_MATH_V2_HPP
#include <cmath>
#include "data_type.hpp"
#include "half.hpp"
namespace
ck
{
namespace
math
{
static
inline
__device__
half_t
abs
(
half_t
x
)
{
return
__habs
(
x
);
};
static
inline
__device__
half_t
sqrtf
(
half_t
x
)
{
return
hsqrt
(
x
);
};
static
inline
__device__
bool
isnan
(
half_t
x
)
{
return
__hisnan
(
x
);
};
static
inline
__host__
float
abs
(
float
x
)
{
return
std
::
abs
(
x
);
};
static
inline
__host__
double
abs
(
double
x
)
{
return
std
::
abs
(
x
);
};
static
inline
__host__
int8_t
abs
(
int8_t
x
)
{
int8_t
sgn
=
x
>>
(
8
-
1
);
return
(
x
^
sgn
)
-
sgn
;
};
static
inline
__host__
int32_t
abs
(
int32_t
x
)
{
int32_t
sgn
=
x
>>
(
32
-
1
);
return
(
x
^
sgn
)
-
sgn
;
};
static
inline
__host__
half_t
abs
(
half_t
x
)
{
half_float
::
half
xx
=
*
reinterpret_cast
<
half_float
::
half
*>
(
&
x
);
half_float
::
half
abs_xx
=
half_float
::
abs
(
xx
);
half_t
abs_x
=
*
reinterpret_cast
<
half_t
*>
(
&
abs_xx
);
return
abs_x
;
};
static
inline
__host__
float
isnan
(
float
x
)
{
return
std
::
isnan
(
x
);
};
static
inline
__host__
double
isnan
(
double
x
)
{
return
std
::
isnan
(
x
);
};
static
inline
__host__
int8_t
isnan
(
int8_t
x
)
{
(
void
)
x
;
return
false
;
};
static
inline
__host__
int32_t
isnan
(
int32_t
x
)
{
(
void
)
x
;
return
false
;
};
static
inline
__host__
bool
isnan
(
half_t
x
)
{
half_float
::
half
xx
=
*
reinterpret_cast
<
half_float
::
half
*>
(
&
x
);
return
half_float
::
isnan
(
xx
);
};
}
// namespace math
}
// namespace ck
...
...
include/ck/utility/reduction_common.hpp
View file @
5e6cca6f
...
...
@@ -33,7 +33,7 @@ namespace ck {
struct
float_equal_one
{
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
__host__
__device__
inline
bool
operator
()(
T
x
)
{
return
x
<=
static_cast
<
T
>
(
1.0
f
)
and
x
>=
static_cast
<
T
>
(
1.0
f
);
};
...
...
@@ -42,7 +42,7 @@ struct float_equal_one
struct
float_equal_zero
{
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
__host__
__device__
inline
bool
operator
()(
T
x
)
{
return
x
<=
static_cast
<
T
>
(
0.0
f
)
and
x
>=
static_cast
<
T
>
(
0.0
f
);
};
...
...
include/ck/utility/transpose_vectors.hpp
View file @
5e6cca6f
...
...
@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
template
<
index_t
NX
,
index_t
NY
>
struct
transpose_vectors
<
half_t
,
NX
,
NY
>
{
// we got [NY * NX] am
m
ount of S data to be transposed
// we got [NY * NX] amount of S data to be transposed
static
constexpr
index_t
s_per_x
=
NY
;
static
constexpr
index_t
s_per_y
=
NX
;
...
...
@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
}
};
// transpose int8 4x4
__device__
void
transpose_int8_4x4
(
const
int8x4_t
&
x0
,
const
int8x4_t
&
x1
,
const
int8x4_t
&
x2
,
const
int8x4_t
&
x3
,
int8x4_t
&
y0
,
int8x4_t
&
y1
,
int8x4_t
&
y2
,
int8x4_t
&
y3
)
{
int32_t
t0
,
t1
;
int32_t
z0
,
z1
,
z2
,
z3
;
constexpr
int32_t
m0
=
0x05010400
;
constexpr
int32_t
m1
=
0x05040100
;
constexpr
int32_t
m2
=
0x07060302
;
constexpr
int32_t
m3
=
0x07030602
;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits first)
// clang-format off
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t0
)
:
"v"
(
bit_cast
<
int32_t
>
(
x1
)),
"v"
(
bit_cast
<
int32_t
>
(
x0
)),
"s"
(
m0
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t1
)
:
"v"
(
bit_cast
<
int32_t
>
(
x3
)),
"v"
(
bit_cast
<
int32_t
>
(
x2
)),
"s"
(
m0
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z0
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m1
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z1
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m2
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t0
)
:
"v"
(
bit_cast
<
int32_t
>
(
x1
)),
"v"
(
bit_cast
<
int32_t
>
(
x0
)),
"s"
(
m3
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
t1
)
:
"v"
(
bit_cast
<
int32_t
>
(
x3
)),
"v"
(
bit_cast
<
int32_t
>
(
x2
)),
"s"
(
m3
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z2
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m1
));
asm
volatile
(
"v_perm_b32 %0, %1, %2, %3"
:
"=v"
(
z3
)
:
"v"
(
bit_cast
<
int32_t
>
(
t1
)),
"v"
(
bit_cast
<
int32_t
>
(
t0
)),
"s"
(
m2
));
// clang-format on
y0
=
bit_cast
<
int8x4_t
>
(
z0
);
y1
=
bit_cast
<
int8x4_t
>
(
z1
);
y2
=
bit_cast
<
int8x4_t
>
(
z2
);
y3
=
bit_cast
<
int8x4_t
>
(
z3
);
}
template
<
index_t
NX
,
index_t
NY
>
struct
transpose_vectors
<
int8_t
,
NX
,
NY
>
{
// we got [NY * NX] amount of S data to be transposed
static
constexpr
index_t
s_per_x
=
NY
;
static
constexpr
index_t
s_per_y
=
NX
;
using
S
=
int8_t
;
using
VX
=
vector_type
<
int8_t
,
s_per_x
>
;
using
VY
=
vector_type
<
int8_t
,
s_per_y
>
;
__device__
void
operator
()(
const
StaticallyIndexedArray
<
const
VX
&
,
NX
>&
vx_tuple
,
StaticallyIndexedArray
<
VY
&
,
NY
>&
vy_tuple
)
{
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static_assert
((
NX
%
4
==
0
&&
NY
%
4
==
0
),
"wrong!"
);
// loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
static_for
<
0
,
NY
,
4
>
{}([
&
](
auto
iy
)
{
static_for
<
0
,
NX
,
4
>
{}([
&
](
auto
ix
)
{
// reference to 4 int8 data from vx_tuple
const
auto
&
x_s4_0
=
vx_tuple
[
ix
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
const
auto
&
x_s4_1
=
vx_tuple
[
ix
+
I1
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
const
auto
&
x_s4_2
=
vx_tuple
[
ix
+
I2
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
const
auto
&
x_s4_3
=
vx_tuple
[
ix
+
I3
].
template
AsType
<
int8x4_t
>()[
iy
/
I4
];
// reference to 4 int8 data from vy_tuple
auto
&
y_s4_0
=
vy_tuple
(
iy
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
auto
&
y_s4_1
=
vy_tuple
(
iy
+
I1
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
auto
&
y_s4_2
=
vy_tuple
(
iy
+
I2
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
auto
&
y_s4_3
=
vy_tuple
(
iy
+
I3
).
template
AsType
<
int8x4_t
>()(
ix
/
I4
);
// transpose
transpose_int8_4x4
(
x_s4_0
,
x_s4_1
,
x_s4_2
,
x_s4_3
,
y_s4_0
,
y_s4_1
,
y_s4_2
,
y_s4_3
);
});
});
}
};
}
// namespace ck
#endif
library/CMakeLists.txt
View file @
5e6cca6f
add_subdirectory
(
src/host_tensor
)
add_subdirectory
(
src/tensor_operation_instance/gpu
)
add_subdirectory
(
src/tensor_operation_instance/cpu
)
\ No newline at end of file
add_subdirectory
(
src/utility
)
add_subdirectory
(
src/tensor_operation_instance/cpu
)
library/include/ck/library/host_tensor/host_reduce_util.hpp
View file @
5e6cca6f
...
...
@@ -26,7 +26,6 @@
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits>
#include <cmath>
#include <cassert>
...
...
@@ -34,6 +33,8 @@
#include <string>
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"
namespace
ck
{
...
...
@@ -42,34 +43,10 @@ namespace host_reduce {
using
ck
::
NanPropagation
;
using
ck
::
ReduceTensorOp
;
template
<
typename
T
>
static
inline
bool
float_equal_one
(
T
);
static
inline
bool
float_equal_one
(
float
x
)
{
return
x
==
1.0
f
;
};
static
inline
bool
float_equal_one
(
double
x
)
{
return
x
==
1.0
;
};
static
inline
bool
float_equal_one
(
half_float
::
half
x
)
{
return
x
==
static_cast
<
half_float
::
half
>
(
1.0
f
);
};
template
<
typename
T
>
static
inline
bool
float_equal_zero
(
T
x
);
static
inline
bool
float_equal_zero
(
float
x
)
{
return
x
==
0.0
f
;
};
static
inline
bool
float_equal_zero
(
double
x
)
{
return
x
==
0.0
;
};
static
inline
bool
float_equal_zero
(
half_float
::
half
x
)
{
return
x
==
static_cast
<
half_float
::
half
>
(
0.0
f
);
};
template
<
typename
AccDataType
,
ReduceTensorOp
ReduceOpId
>
__host__
static
inline
std
::
function
<
void
(
AccDataType
&
)
>
PreUnaryOpFn
(
int
)
{
using
std
::
abs
;
using
ck
::
math
::
abs
;
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
NORM1
)
{
...
...
@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
}
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
MIN
)
{
return
(
std
::
n
umeric
_l
imits
<
AccDataType
>::
m
ax
());
return
(
ck
::
N
umeric
L
imits
<
AccDataType
>::
M
ax
());
}
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
MAX
)
{
return
(
std
::
n
umeric
_l
imits
<
AccDataType
>::
l
owest
());
return
(
ck
::
N
umeric
L
imits
<
AccDataType
>::
L
owest
());
}
else
if
constexpr
(
ReduceOpId
==
ReduceTensorOp
::
AMAX
)
{
...
...
@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType
&
accuVal
,
AccDataType
currVal
)
{
using
std
::
isnan
;
using
ck
::
math
::
isnan
;
if
constexpr
(
!
PropagateNan
)
{
...
...
@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
int
&
accuIndex
,
int
currIndex
)
{
using
std
::
isnan
;
using
ck
::
math
::
isnan
;
if
constexpr
(
!
PropagateNan
)
{
...
...
library/include/ck/library/host_tensor/host_reduction.hpp
View file @
5e6cca6f
...
...
@@ -32,6 +32,7 @@
#include <functional>
#include "reduction_enums.hpp"
#include "reduction_common.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
...
...
@@ -196,10 +197,10 @@ struct ReductionHost
OutDataType
*
out_data
,
IndexDataType
*
out_indices
)
{
using
ck
::
float_equal_one
;
using
ck
::
float_equal_zero
;
using
ck
::
type_convert
;
using
ck
::
host_reduce
::
binop_with_nan_check2
;
using
ck
::
host_reduce
::
float_equal_one
;
using
ck
::
host_reduce
::
float_equal_zero
;
using
ck
::
host_reduce
::
ReduceOpFn2
;
using
ck
::
host_reduce
::
ReduceOpZeroVal
;
...
...
@@ -227,10 +228,10 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
0
])
*
type_convert
<
AccDataType
>
(
beta
);
out_data
[
0
]
=
type_convert
<
OutDataType
>
(
accuVal
);
...
...
@@ -263,13 +264,13 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
auto
dst_offset
=
get_offset_from_index
<
NumInvariantDim
>
(
outStrides
,
invariant_index
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
dst_offset
])
*
type_convert
<
AccDataType
>
(
beta
);
...
...
@@ -303,10 +304,10 @@ struct ReductionHost
void
RunImpl_no_index
(
float
alpha
,
const
InDataType
*
in_data
,
float
beta
,
OutDataType
*
out_data
)
{
using
ck
::
float_equal_one
;
using
ck
::
float_equal_zero
;
using
ck
::
type_convert
;
using
ck
::
host_reduce
::
binop_with_nan_check
;
using
ck
::
host_reduce
::
float_equal_one
;
using
ck
::
host_reduce
::
float_equal_zero
;
using
ck
::
host_reduce
::
ReduceOpFn
;
using
ck
::
host_reduce
::
ReduceOpZeroVal
;
...
...
@@ -330,10 +331,10 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
0
])
*
type_convert
<
AccDataType
>
(
beta
);
out_data
[
0
]
=
type_convert
<
OutDataType
>
(
accuVal
);
...
...
@@ -361,13 +362,13 @@ struct ReductionHost
posUnaryOp
(
accuVal
);
if
(
!
float_equal_one
(
alpha
))
if
(
!
float_equal_one
{}
(
alpha
))
accuVal
*=
type_convert
<
AccDataType
>
(
alpha
);
auto
dst_offset
=
get_offset_from_index
<
NumInvariantDim
>
(
outStrides
,
invariant_index
);
if
(
!
float_equal_zero
(
beta
))
if
(
!
float_equal_zero
{}
(
beta
))
accuVal
+=
type_convert
<
AccDataType
>
(
out_data
[
dst_offset
])
*
type_convert
<
AccDataType
>
(
beta
);
...
...
library/include/ck/library/utility/conv_fwd_util.hpp
View file @
5e6cca6f
This diff is collapsed.
Click to expand it.
library/include/ck/library/utility/fill.hpp
0 → 100644
View file @
5e6cca6f
#pragma once
#include <algorithm>
#include <random>
#include "data_type.hpp"
namespace
ck
{
namespace
utils
{
// template <typename T, class Enable = void>
// struct FillUniform;
// TODO: what's wrong with this specialization???
// err: segmentation fault in mt19937 - infinite loop like.
// template <typename T>
// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
// !std::is_same<T, bhalf_t>::value>::type>
// {
// int a_{0};
// int b_{5};
// // T a_ = T{0};
// // T b_ = T{5};
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen{11939};
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
// }
// };
// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
// std::is_same<T, bhalf_t>::value>::type>
template
<
typename
T
>
struct
FillUniform
{
float
a_
{
0
};
float
b_
{
5
};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
mt19937
gen
{
11939
};
std
::
uniform_real_distribution
<>
dis
(
a_
,
b_
);
std
::
generate
(
first
,
last
,
[
&
dis
,
&
gen
]()
{
return
ck
::
type_convert
<
T
>
(
dis
(
gen
));
});
}
};
template
<
typename
T
>
struct
FillMonotonicSeq
{
T
init_value_
{
0
};
T
step_
{
1
};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
generate
(
first
,
last
,
[
=
,
n
=
init_value_
]()
mutable
{
auto
tmp
=
n
;
n
+=
step_
;
return
tmp
;
});
}
};
template
<
typename
T
>
struct
FillConstant
{
T
value_
{
0
};
template
<
typename
ForwardIter
>
void
operator
()(
ForwardIter
first
,
ForwardIter
last
)
const
{
std
::
fill
(
first
,
last
,
value_
);
}
};
}
// namespace utils
}
// namespace ck
library/include/ck/library/utility/op_instance_engine.hpp
0 → 100644
View file @
5e6cca6f
#pragma once
#include <cstdlib>
#include <limits>
#include <memory>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>
#include "check_err.hpp"
#include "device_base.hpp"
#include "functional2.hpp"
namespace
ck
{
namespace
utils
{
struct
ProfileBestConfig
{
std
::
string
best_op_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_tflops
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
std
::
numeric_limits
<
float
>::
max
();
};
/**
* @brief This class describes an operation instance(s).
*
* Op instance defines a particular specializations of operator
* template. Thanks to this specific input/output data types, data
* layouts and modifying elementwise operations it is able to create
* it's input/output tensors, provide pointers to instances which
* can execute it and all operation specific parameters.
*/
template
<
typename
OutDataType
,
typename
...
InArgTypes
>
class
OpInstance
{
public:
template
<
typename
T
>
using
TensorPtr
=
std
::
unique_ptr
<
Tensor
<
T
>>
;
using
InTensorsTuple
=
std
::
tuple
<
TensorPtr
<
InArgTypes
>
...
>
;
using
DeviceMemPtr
=
std
::
unique_ptr
<
DeviceMem
>
;
using
DeviceBuffers
=
std
::
vector
<
DeviceMemPtr
>
;
OpInstance
()
=
default
;
OpInstance
(
const
OpInstance
&
)
=
default
;
OpInstance
&
operator
=
(
const
OpInstance
&
)
=
default
;
virtual
~
OpInstance
(){};
virtual
InTensorsTuple
GetInputTensors
()
const
=
0
;
virtual
TensorPtr
<
OutDataType
>
GetOutputTensor
()
const
=
0
;
virtual
std
::
unique_ptr
<
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
(
tensor_operation
::
device
::
BaseOperator
*
)
const
=
0
;
virtual
std
::
unique_ptr
<
tensor_operation
::
device
::
BaseArgument
>
MakeArgumentPointer
(
tensor_operation
::
device
::
BaseOperator
*
,
const
DeviceBuffers
&
,
const
DeviceMemPtr
&
)
const
=
0
;
virtual
std
::
size_t
GetFlops
()
const
=
0
;
virtual
std
::
size_t
GetBtype
()
const
=
0
;
};
/**
* @brief A generic operation instance run engine.
*/
template
<
typename
OutDataType
,
typename
...
InArgTypes
>
class
OpInstanceRunEngine
{
public:
using
OpInstanceT
=
OpInstance
<
InArgTypes
...,
OutDataType
>
;
template
<
typename
T
>
using
TensorPtr
=
std
::
unique_ptr
<
Tensor
<
T
>>
;
using
DeviceMemPtr
=
std
::
unique_ptr
<
DeviceMem
>
;
using
InTensorsTuple
=
std
::
tuple
<
TensorPtr
<
InArgTypes
>
...
>
;
using
DeviceBuffers
=
std
::
vector
<
DeviceMemPtr
>
;
using
InArgsTypesTuple
=
std
::
tuple
<
InArgTypes
...
>
;
OpInstanceRunEngine
()
=
delete
;
template
<
typename
ReferenceOp
=
std
::
function
<
void
()>
>
OpInstanceRunEngine
(
const
OpInstanceT
&
op_instance
,
const
ReferenceOp
&
reference_op
=
ReferenceOp
{})
:
op_instance_
{
op_instance
}
{
in_tensors_
=
op_instance_
.
GetInputTensors
();
out_tensor_
=
op_instance_
.
GetOutputTensor
();
if
constexpr
(
std
::
is_invocable_v
<
ReferenceOp
,
const
Tensor
<
InArgTypes
>&
...,
Tensor
<
OutDataType
>&>
)
{
ref_output_
=
op_instance_
.
GetOutputTensor
();
CallRefOpUnpackArgs
(
reference_op
,
std
::
make_index_sequence
<
kNInArgs_
>
{});
}
AllocateDeviceInputTensors
(
std
::
make_index_sequence
<
kNInArgs_
>
{});
out_device_buffer_
=
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
OutDataType
)
*
out_tensor_
->
mDesc
.
GetElementSpace
());
out_device_buffer_
->
SetZero
();
}
virtual
~
OpInstanceRunEngine
(){};
template
<
typename
OpInstancePtr
>
bool
Test
(
const
std
::
vector
<
OpInstancePtr
>&
op_ptrs
)
{
bool
res
{
true
};
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
invoker
=
op_instance_
.
MakeInvokerPointer
(
op_ptr
.
get
());
auto
argument
=
op_instance_
.
MakeArgumentPointer
(
op_ptr
.
get
(),
in_device_buffers_
,
out_device_buffer_
);
if
(
op_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
invoker
->
Run
(
argument
.
get
());
out_device_buffer_
->
FromDevice
(
out_tensor_
->
mData
.
data
());
if
(
!
ref_output_
)
{
throw
std
::
runtime_error
(
"OpInstanceRunEngine::Test: Reference value not availabe."
" You have to provide reference function."
);
}
// TODO: enable flexible use of custom check_error functions
res
=
res
&&
check_err
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
out_device_buffer_
->
SetZero
();
}
}
return
res
;
}
template
<
typename
OpInstancePtr
>
ProfileBestConfig
Profile
(
const
std
::
vector
<
OpInstancePtr
>&
op_ptrs
,
int
nrepeat
=
100
,
bool
do_verification
=
false
,
bool
do_log
=
false
)
{
bool
res
{
true
};
ProfileBestConfig
best_config
;
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
invoker
=
op_instance_
.
MakeInvokerPointer
(
op_ptr
.
get
());
auto
argument
=
op_instance_
.
MakeArgumentPointer
(
op_ptr
.
get
(),
in_device_buffers_
,
out_device_buffer_
);
if
(
op_ptr
->
IsSupportedArgument
(
argument
.
get
()))
{
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
float
avg_time
=
invoker
->
Run
(
argument
.
get
(),
nrepeat
);
std
::
size_t
flops
=
op_instance_
.
GetFlops
();
std
::
size_t
num_btype
=
op_instance_
.
GetBtype
();
float
tflops
=
static_cast
<
float
>
(
flops
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
<
best_config
.
best_tflops
)
{
best_config
.
best_op_name
=
op_name
;
best_config
.
best_tflops
=
tflops
;
best_config
.
best_gb_per_sec
=
gb_per_sec
;
best_config
.
best_avg_time
=
avg_time
;
}
if
(
do_verification
)
{
out_device_buffer_
->
FromDevice
(
out_tensor_
->
mData
.
data
());
if
(
!
ref_output_
)
{
throw
std
::
runtime_error
(
"OpInstanceRunEngine::Profile: Reference value not availabe."
" You have to provide reference function."
);
}
// TODO: enable flexible use of custom check_error functions
res
=
res
&&
CheckErr
(
out_tensor_
->
mData
,
ref_output_
->
mData
);
if
(
do_log
)
{}
}
out_device_buffer_
->
SetZero
();
}
}
return
best_config
;
}
void
SetAtol
(
double
a
)
{
atol_
=
a
;
}
void
SetRtol
(
double
r
)
{
rtol_
=
r
;
}
private:
template
<
typename
F
,
std
::
size_t
...
Is
>
void
CallRefOpUnpackArgs
(
const
F
&
f
,
std
::
index_sequence
<
Is
...
>
)
const
{
f
(
*
std
::
get
<
Is
>
(
in_tensors_
)...,
*
ref_output_
);
}
template
<
std
::
size_t
...
Is
>
void
AllocateDeviceInputTensors
(
std
::
index_sequence
<
Is
...
>
)
{
(
AllocateDeviceInputTensorsImpl
<
Is
>
(),
...);
}
template
<
std
::
size_t
Index
>
void
AllocateDeviceInputTensorsImpl
()
{
const
auto
&
ts
=
std
::
get
<
Index
>
(
in_tensors_
);
in_device_buffers_
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
std
::
tuple_element_t
<
Index
,
InArgsTypesTuple
>
)
*
ts
->
mDesc
.
GetElementSpace
()))
->
ToDevice
(
ts
->
mData
.
data
());
}
static
constexpr
std
::
size_t
kNInArgs_
=
std
::
tuple_size_v
<
InTensorsTuple
>
;
const
OpInstanceT
&
op_instance_
;
double
rtol_
{
1e-5
};
double
atol_
{
1e-8
};
InTensorsTuple
in_tensors_
;
TensorPtr
<
OutDataType
>
out_tensor_
;
TensorPtr
<
OutDataType
>
ref_output_
;
DeviceBuffers
in_device_buffers_
;
DeviceMemPtr
out_device_buffer_
;
    /// Compares device output against the reference output using the
    /// library-wide check_err helper and the engine's configured tolerances.
    ///
    /// NOTE(review): atol_ is passed before rtol_ here. Confirm this matches
    /// the parameter order of ck::utils::check_err — if that helper takes
    /// (rtol, atol), these two arguments are swapped.
    template <typename T>
    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
    {
        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
    }
};
}
// namespace utils
}
// namespace ck
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
View file @
5e6cca6f
...
...
@@ -28,19 +28,19 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances = std::tuple<
//#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
256
,
128
,
32
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
256
,
32
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
128
,
32
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
128
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
64
,
128
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
64
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
64
,
128
,
32
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
32
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
32
,
128
,
32
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
32
,
32
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
32
,
64
,
32
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
256
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
256
,
16
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
256
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
128
,
32
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
128
,
32
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
64
,
32
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
,
DeviceGemm_Xdl_CShuffle
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
1
,
64
,
32
,
64
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
>
// clang-format on
>
;
...
...
library/src/utility/CMakeLists.txt
0 → 100644
View file @
5e6cca6f
# Build the conv_fwd_util shared helper library used by the convolution
# examples/profilers.

# Make CK and library headers visible to this directory's targets.
include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/include/ck
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
    ${PROJECT_SOURCE_DIR}/include/ck/utility
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
)

set(CONV_FWD_UTIL_SOURCE conv_fwd_util.cpp)

add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE})
target_link_libraries(conv_fwd_util PRIVATE host_tensor)
# NOTE(review): no compile features are listed after PUBLIC — confirm this is
# intentional; target_compile_features normally expects at least one feature.
target_compile_features(conv_fwd_util PUBLIC)
# PIC is required since this object code ends up in a shared library.
set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(conv_fwd_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
clang_tidy_check(conv_fwd_util)
library/src/utility/conv_fwd_util.cpp
0 → 100644
View file @
5e6cca6f
#include "conv_fwd_util.hpp"
namespace
ck
{
namespace
utils
{
namespace
conv
{
/**
 * @brief Calculate number of FLOPs for Convolution
 *
 * FLOPs = 2 * N * K * <output spatial product> * C * <filter spatial product>
 *
 * @param[in] N                      Batch size.
 * @param[in] C                      Number of input channels.
 * @param[in] K                      Number of output channels.
 * @param[in] filter_spatial_lengths Filter spatial dimensions lengths.
 * @param[in] output_spatial_lengths Convolution output spatial dimensions
 *                                   lengths.
 *
 * @return The number of flops.
 */
std::size_t get_flops(ck::index_t N,
                      ck::index_t C,
                      ck::index_t K,
                      const std::vector<ck::index_t>& filter_spatial_lengths,
                      const std::vector<ck::index_t>& output_spatial_lengths)
{
    // Accumulate in std::size_t so large problem sizes do not overflow the
    // (narrower) index type.
    const auto spatial_product = [](const std::vector<ck::index_t>& lengths) {
        return std::accumulate(std::begin(lengths),
                               std::end(lengths),
                               static_cast<std::size_t>(1),
                               std::multiplies<std::size_t>());
    };

    return static_cast<std::size_t>(2) * N * K * spatial_product(output_spatial_lengths) * C *
           spatial_product(filter_spatial_lengths);
}
// Default parameters: a 2-D convolution with batch 128, 192 input channels,
// 256 output channels, 3x3 filters on 71x71 input, stride 2, dilation 1 and
// symmetric padding of 1.
ConvParams::ConvParams()
    : num_dim_spatial(2),
      N(128),
      K(256),
      C(192),
      filter_spatial_lengths(2, 3),
      input_spatial_lengths(2, 71),
      conv_filter_strides(2, 2),
      conv_filter_dilations(2, 1),
      input_left_pads(2, 1),
      input_right_pads(2, 1)
{
}
/// Constructs convolution parameters, validating that every per-dimension
/// vector holds exactly @p n_dim entries.
///
/// @param n_dim          Number of spatial dimensions.
/// @param n_batch        Batch size (N).
/// @param n_out_channels Number of output channels (K).
/// @param n_in_channels  Number of input channels (C).
/// @param filters_len    Filter spatial lengths per dimension.
/// @param input_len      Input spatial lengths per dimension.
/// @param strides        Convolution strides per dimension.
/// @param dilations      Filter dilations per dimension.
/// @param left_pads      Input left padding per dimension.
/// @param right_pads     Input right padding per dimension.
///
/// @throws std::runtime_error if any vector's size differs from @p n_dim.
ConvParams::ConvParams(ck::index_t n_dim,
                       ck::index_t n_batch,
                       ck::index_t n_out_channels,
                       ck::index_t n_in_channels,
                       const std::vector<ck::index_t>& filters_len,
                       const std::vector<ck::index_t>& input_len,
                       const std::vector<ck::index_t>& strides,
                       const std::vector<ck::index_t>& dilations,
                       const std::vector<ck::index_t>& left_pads,
                       const std::vector<ck::index_t>& right_pads)
    : num_dim_spatial(n_dim),
      N(n_batch),
      K(n_out_channels),
      C(n_in_channels),
      filter_spatial_lengths(filters_len),
      input_spatial_lengths(input_len),
      conv_filter_strides(strides),
      conv_filter_dilations(dilations),
      input_left_pads(left_pads),
      input_right_pads(right_pads)
{
    // Single cast avoids signed/unsigned comparison warnings; a vector size
    // can never equal a negative dimension count either way.
    const auto ndim = static_cast<std::size_t>(num_dim_spatial);
    if(filter_spatial_lengths.size() != ndim || input_spatial_lengths.size() != ndim ||
       conv_filter_strides.size() != ndim || conv_filter_dilations.size() != ndim ||
       input_left_pads.size() != ndim || input_right_pads.size() != ndim)
    {
        // Fixed: the message previously named GetOutputSpatialLengths even
        // though this error is raised by the constructor.
        throw(std::runtime_error("ConvParams::ConvParams: "
                                 "parameter size is different from number of declared dimensions!"));
    }
}
/// Computes the convolution output spatial lengths from the stored input
/// lengths, filter lengths, strides, dilations and paddings.
///
/// Per dimension i:
///   XEff = (X - 1) * dilation + 1
///   Wo   = (Wi + left_pad + right_pad - XEff) / stride + 1
///
/// @return One output length per spatial dimension.
/// @throws std::runtime_error if any parameter vector's size differs from
///         num_dim_spatial.
std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const
{
    // Single cast avoids signed/unsigned comparison warnings; the outcome is
    // identical to comparing against num_dim_spatial directly.
    const auto ndim = static_cast<std::size_t>(num_dim_spatial);
    if(filter_spatial_lengths.size() != ndim || input_spatial_lengths.size() != ndim ||
       conv_filter_strides.size() != ndim || conv_filter_dilations.size() != ndim ||
       input_left_pads.size() != ndim || input_right_pads.size() != ndim)
    {
        throw(std::runtime_error("ConvParams::GetOutputSpatialLengths: "
                                 "parameter size is different from number of declared dimensions!"));
    }

    std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
    for(ck::index_t i = 0; i < num_dim_spatial; ++i)
    {
        // Effective filter extent once dilation is applied.
        const ck::index_t idx_eff = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
        out_spatial_len[i] =
            (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
                conv_filter_strides[i] +
            1;
    }
    return out_spatial_len;
}
ConvParams
parse_conv_params
(
int
num_dim_spatial
,
int
arg_idx
,
char
*
const
argv
[])
{
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
num_dim_spatial
=
num_dim_spatial
;
params
.
N
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
filter_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_spatial_lengths
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_strides
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_strides
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_dilations
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_dilations
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_left_pads
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_left_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_right_pads
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_right_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
return
params
;
}
/// Builds a HostTensorDescriptor for a convolution output tensor, choosing
/// the channels-last output layout tag (NWK / NHWK / NDHWK) from the number
/// of spatial dimensions.
///
/// @throws std::runtime_error for unsupported spatial dimension counts.
HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                       int num_dim_spatial)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
    case 2: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
    case 1: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
    default: throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
}
/// Builds a HostTensorDescriptor for a convolution filter tensor, choosing
/// the filter layout tag (KXC / KYXC / KZYXC) from the number of spatial
/// dimensions.
///
/// @throws std::runtime_error for unsupported spatial dimension counts.
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                        int num_dim_spatial)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
    case 2: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
    case 1: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
    default: throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
}
/// Builds a HostTensorDescriptor for a convolution input tensor, choosing
/// the channels-last input layout tag (NWC / NHWC / NDHWC) from the number
/// of spatial dimensions.
///
/// @throws std::runtime_error for unsupported spatial dimension counts.
HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
    case 2: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
    case 1: return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
    default: throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
}
}
// namespace conv
}
// namespace utils
}
// namespace ck
// Streams a human-readable dump of all convolution parameters.
// NOTE(review): the per-dimension vector fields rely on an operator<< for
// std::vector<ck::index_t> being declared elsewhere — confirm it is visible
// to every translation unit that uses this operator.
std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p)
{
    os << "ConvParams {"
       << "\nnum_dim_spatial: " << p.num_dim_spatial
       << "\nN: " << p.N
       << "\nK: " << p.K
       << "\nC: " << p.C
       << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths
       << "\ninput_spatial_lengths: " << p.input_spatial_lengths
       << "\nconv_filter_strides: " << p.conv_filter_strides
       << "\nconv_filter_dilations: " << p.conv_filter_dilations
       << "\ninput_left_pads: " << p.input_left_pads
       << "\ninput_right_pads: " << p.input_right_pads;
    return os;
}
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment