gaoqiong / composable_kernel
"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "78ee07eabd876fd6702443859c46d7f9a67dc351"
Commit 4e5e68a1, authored Oct 11, 2021 by Jing Zhang

    add bias

Parents: 157ce4cc, af84fba3
Showing 6 changed files with 135 additions and 32 deletions:
    composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp                                         +83  -11
    composable_kernel/include/utility/config.hpp                                                                    +7   -0
    host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp     +5   -1
    host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp     +7   -2
    host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp                                                      +28  -14
    host/host_tensor/include/host_conv.hpp                                                                          +5   -4
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp

@@ -17,7 +17,7 @@ template <typename GridwiseGemm,
           typename FloatC,
           typename AGridDesc_E0_E1_K0_K1_E2,
           typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-          typename CGridDesc_K_N_H0_H1_H2_W0_W1_W2,
+          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
           typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
           bool HasMainE0BlockLoop>
 __global__ void
@@ -27,10 +27,11 @@ __global__ void
     kernel_gemm_dlops_v2(const FloatAB* __restrict__ p_a_grid,
                          const FloatAB* __restrict__ p_b_grid,
+                         const FloatC* __restrict__ p_bias_grid,
                          FloatC* __restrict__ p_c_grid,
                          const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
                          const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                         const CGridDesc_K_N_H0_H1_H2_W0_W1_W2 c_k_n_h0_h1_h2_w0_w1_w2_grid_desc,
+                         const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                          const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor)
 {
     constexpr index_t shared_block_size =
@@ -40,11 +41,12 @@ __global__ void
     GridwiseGemm::Run(p_a_grid,
                       p_b_grid,
+                      p_bias_grid,
                       p_c_grid,
                       p_shared_block,
                       a_e0_e1_k0_k1_e2_grid_desc,
                       b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                      c_k_n_h0_h1_h2_w0_w1_w2_grid_desc,
+                      c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                       c_blockid_to_k_n_h_w_block_cluster_adaptor,
                       integral_constant<bool, HasMainE0BlockLoop>{});
 }
@@ -57,7 +59,7 @@ template <typename GridwiseGemm,
           typename FloatC,
           typename AGridDesc_E0_E1_K0_K1_E2,
           typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
-          typename CGridDesc_K_N_H0_H1_H2_W0_W1_W2,
+          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
           typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
           bool HasMainE0BlockLoop>
 __global__ void
@@ -66,10 +68,11 @@ __global__ void
 #endif
     kernel_gemm_dlops_v2(const FloatAB* __restrict__ p_a_grid,
                          const FloatAB* __restrict__ p_b_grid,
+                         const FloatC* __restrict__ p_bias_grid,
                          FloatC* __restrict__ p_c_grid,
                          const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
                          const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                         const void CONSTANT* p_c_k_n_h0_h1_h2_w0_w1_w2_grid_desc,
+                         const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                          const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
 {
     // first cast void CONSTANT void* to void*
@@ -80,9 +83,9 @@ __global__ void
     const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
         *reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>(
             cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc));
-    const auto c_k_n_h0_h1_h2_w0_w1_w2_grid_desc =
-        *reinterpret_cast<const CGridDesc_K_N_H0_H1_H2_W0_W1_W2*>(
-            cast_pointer_to_generic_address_space(p_c_k_n_h0_h1_h2_w0_w1_w2_grid_desc));
+    const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
+        *reinterpret_cast<const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2*>(
+            cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc));
     const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
         *reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>(
             cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor));
@@ -94,11 +97,12 @@ __global__ void
     GridwiseGemm::Run(p_a_grid,
                       p_b_grid,
+                      p_bias_grid,
                       p_c_grid,
                       p_shared_block,
                       a_e0_e1_k0_k1_e2_grid_desc,
                       b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                      c_k_n_h0_h1_h2_w0_w1_w2_grid_desc,
+                      c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                       c_blockid_to_k_n_h_w_block_cluster_adaptor,
                       integral_constant<bool, HasMainE0BlockLoop>{});
 }
@@ -143,7 +147,7 @@ template <index_t BlockSize,
           typename CGlobalStepHacks,
           typename AGlobalMoveSliceWindowStepHacks,
           typename BGlobalMoveSliceWindowStepHacks,
-          index_t activ_type = 0>
+          ActivTypeEnum_t activ_type = ActivTypeEnum_t::None>
 struct GridwiseGemmDlops_km_kn_mn_v3
 {
     static constexpr auto I0 = Number<0>{};
@@ -159,6 +163,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
     static constexpr auto NPerBlock = I1;
 
+    static constexpr FloatC alpha = 0.3;
+
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
@@ -338,10 +344,20 @@ struct GridwiseGemmDlops_km_kn_mn_v3
     using CBlockIdToBlockClusterAdaptor_K_N_H_W =
         decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));
 
+    __host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor(
+        const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)
+    {
+        const auto K0 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I0);
+        const auto K1 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I1);
+
+        return make_naive_tensor_descriptor_packed(make_tuple(K0, K1));
+    }
+
     template <bool HasMainE0BlockLoop>
     __device__ static void
         Run(const FloatAB* __restrict__ p_a_global,
             const FloatAB* __restrict__ p_b_global,
+            const FloatC* __restrict__ p_bias_global,
             FloatC* __restrict__ p_c_global,
             FloatAB* __restrict__ p_shared_block,
             const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
@@ -350,10 +366,16 @@ struct GridwiseGemmDlops_km_kn_mn_v3
             const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
             integral_constant<bool, HasMainE0BlockLoop>)
     {
+        const auto bias_k0_k1_grid_desc =
+            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
+
         const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
         const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
+        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
+
         auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
@@ -729,7 +751,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
             if constexpr(activ_type == 1)
             {
-                c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : 0.0;
+                c_thread_buf(i) =
+                    c_thread_buf[i] >= 0 ? c_thread_buf[i] : alpha * c_thread_buf[i];
             }
             else if constexpr(activ_type == 2)
             {
@@ -745,6 +768,55 @@ struct GridwiseGemmDlops_km_kn_mn_v3
             });
         }
 
+        // bias
+        {
+            constexpr auto bias_k0_k1_thread_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
+
+            StaticBuffer<AddressSpaceEnum_t::Vgpr,
+                         FloatC,
+                         bias_k0_k1_thread_desc.GetElementSpaceSize(),
+                         true>
+                bias_thread_buf;
+
+            const index_t k_thread_data_on_global = k_thread_id * KPerThread;
+
+            auto bias_threadwise_transfer =
+                ThreadwiseTensorSliceTransfer_v2<FloatC,
+                                                 FloatC,
+                                                 decltype(bias_k0_k1_grid_desc),
+                                                 decltype(bias_k0_k1_thread_desc),
+                                                 Sequence<I1, Number<KPerThread>{}>,
+                                                 Sequence<0, 1>,
+                                                 1,
+                                                 CThreadTransferDstScalarPerVector,
+                                                 false,
+                                                 true>(
+                    bias_k0_k1_grid_desc,
+                    make_multi_index(k_block_work_id, k_thread_data_on_global));
+
+            constexpr auto bias_k0_k1_global_tensor_step_hacks =
+                make_tuple(make_tuple(Sequence<0>{}, Sequence<0>{}),
+                           make_tuple(Sequence<0>{}, Sequence<0>{}));
+
+            bias_threadwise_transfer.Run(bias_k0_k1_grid_desc,
+                                         bias_global_buf,
+                                         bias_k0_k1_thread_desc,
+                                         make_tuple(I0, I0),
+                                         bias_thread_buf,
+                                         bias_k0_k1_global_tensor_step_hacks);
+
+            static_for<0, KPerThread, 1>{}([&](auto ki) {
+                static_for<0, HoPerThread, 1>{}([&](auto hi) {
+                    static_for<0, WoPerThread, 1>{}([&](auto wi) {
+                        constexpr index_t c_offset = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
+                            make_tuple(ki, 0, hi, wi));
+
+                        c_thread_buf(Number<c_offset>{}) =
+                            c_thread_buf[Number<c_offset>{}] + bias_thread_buf[ki];
+                    });
+                });
+            });
+        }
+
         // output: register to global memory
         {
             // hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global
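For orientation, the epilogue added to GridwiseGemmDlops_km_kn_mn_v3::Run above does two things: it loads a KPerThread-long slice of the bias (one value per output channel owned by the thread) into registers via ThreadwiseTensorSliceTransfer_v2, and then adds bias_thread_buf[ki] to every spatial element of channel ki in the thread's accumulator tile. A minimal scalar sketch of that per-thread pattern follows; tile sizes are illustrative and the flat c_offset computation stands in for c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset. It is not the device code itself.

    #include <vector>

    // Tile sizes mirror the kernel's KPerThread/HoPerThread/WoPerThread template
    // parameters; the concrete values here are illustrative only.
    constexpr int KPerThread  = 4;
    constexpr int HoPerThread = 2;
    constexpr int WoPerThread = 2;

    // c_thread_buf holds KPerThread * HoPerThread * WoPerThread accumulators;
    // bias_global holds one value per output channel k.
    void add_bias_to_thread_tile(std::vector<float>& c_thread_buf,
                                 const std::vector<float>& bias_global,
                                 int k_thread_data_on_global) // first k owned by this thread
    {
        for(int ki = 0; ki < KPerThread; ++ki)
        {
            // corresponds to bias_thread_buf[ki] after the threadwise transfer
            const float b = bias_global[k_thread_data_on_global + ki];

            for(int hi = 0; hi < HoPerThread; ++hi)
            {
                for(int wi = 0; wi < WoPerThread; ++wi)
                {
                    // stand-in for c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(ki, 0, hi, wi)
                    const int c_offset = (ki * HoPerThread + hi) * WoPerThread + wi;
                    c_thread_buf[c_offset] += b;
                }
            }
        }
    }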
composable_kernel/include/utility/config.hpp

@@ -127,6 +127,13 @@ enum InMemoryDataOperationEnum_t
     AtomicAdd
 };
 
+enum ActivTypeEnum_t
+{
+    None = 0,
+    LeakyRelu,
+    Sigmoid
+};
+
 // index type
 using index_t = int32_t;
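The new ActivTypeEnum_t replaces the bare ck::index_t activation selector used elsewhere in this commit. As a hedged host-side sketch, the three enumerators can be read as the usual identity, leaky-ReLU and logistic functions; alpha = 0.3 matches the constant introduced in gridwise_gemm_dlops_v2.hpp, but the repository's own activ() helper in host_conv.hpp remains the authoritative definition.

    #include <cmath>

    enum ActivTypeEnum_t
    {
        None = 0,
        LeakyRelu,
        Sigmoid
    };

    // Illustrative scalar activation keyed on the enum; not necessarily the exact
    // implementation of the repository's host-side activ() helper.
    inline double apply_activation(double x, ActivTypeEnum_t type, double alpha = 0.3)
    {
        switch(type)
        {
        case LeakyRelu: return x >= 0 ? x : alpha * x;     // kernel's activ_type == 1 branch
        case Sigmoid:   return 1.0 / (1.0 + std::exp(-x)); // standard logistic function
        case None:
        default:        return x;
        }
    }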
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp

@@ -6,7 +6,7 @@
 template <typename TInWei,
           typename TAcc,
           typename TOut,
-          ck::index_t activ_type,
+          ck::ActivTypeEnum_t activ_type,
           typename InLengths,
           typename WeiLengths,
           typename OutLengths,
@@ -24,6 +24,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
     const InRightPads& in_right_pads,
     const Tensor<TInWei>& in_n_c0_hi_wi_c1,
     const Tensor<TInWei>& wei_k_c0_y_x_c1,
+    const Tensor<TOut>& bias_k0_k1,
     Tensor<TOut>& out_n_k0_ho_wo_k1,
     ck::index_t nrepeat)
 {
@@ -57,9 +58,11 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
     DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
     DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
                                            out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
+    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
 
     in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
     wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
+    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
 
     constexpr index_t InWeiVectorSize = 8;
@@ -173,6 +176,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
             wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
         static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
             in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+        static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
         static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
         nrepeat);
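The host-side changes above follow the existing pattern for the input and weight tensors: size a DeviceMem buffer from the host tensor, copy the data over with ToDevice, and hand GetDeviceBuffer() to the kernel launcher. DeviceMem is the repository's own wrapper; under ROCm those three steps amount to roughly the HIP calls below (a sketch of the underlying mechanism, not the wrapper's actual implementation).

    #include <hip/hip_runtime.h>
    #include <cstddef>
    #include <vector>

    // Rough HIP equivalent of:
    //   DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    //   bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    //   ... static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()) ...
    float* stage_bias_on_device(const std::vector<float>& bias_k0_k1_host)
    {
        float* p_bias_dev         = nullptr;
        const std::size_t n_bytes = sizeof(float) * bias_k0_k1_host.size();

        hipMalloc(reinterpret_cast<void**>(&p_bias_dev), n_bytes);                     // allocate
        hipMemcpy(p_bias_dev, bias_k0_k1_host.data(), n_bytes, hipMemcpyHostToDevice); // ToDevice
        return p_bias_dev;                                                             // GetDeviceBuffer
    }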
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp

@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
           ck::index_t ABlockTransferDstScalarPerVector_E2,
           ck::index_t BThreadTransferSrcScalarPerVector_E2,
           ck::index_t CThreadTransferDstScalarPerVector_K,
-          ck::index_t activ_type>
+          ck::ActivTypeEnum_t activ_type>
 struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
 {
     template <typename... Wei,
@@ -46,6 +46,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                     const InRightPads& in_right_pads,
                     const FloatAB* __restrict__ p_a_grid,
                     const FloatAB* __restrict__ p_b_grid,
+                    const FloatC* __restrict__ p_bias_grid,
                     FloatC* __restrict__ p_c_grid,
                     const int nrepeat) const
     {
@@ -61,7 +62,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
         const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
         const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
         const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
-        //const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
+        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
 
         const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
         const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
@@ -348,6 +349,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                 0,
                 p_a_grid,
                 p_b_grid,
+                p_bias_grid,
                 p_c_grid,
                 a_e0_e1_k0_k1_e2_grid_desc,
                 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -373,6 +375,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                 0,
                 p_a_grid,
                 p_b_grid,
+                p_bias_grid,
                 p_c_grid,
                 a_e0_e1_k0_k1_e2_grid_desc,
                 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
@@ -418,6 +421,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                 0,
                 p_a_grid,
                 p_b_grid,
+                p_bias_grid,
                 p_c_grid,
                 cast_pointer_to_constant_address_space(
                     a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
@@ -449,6 +453,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
                 0,
                 p_a_grid,
                 p_b_grid,
+                p_bias_grid,
                 p_c_grid,
                 cast_pointer_to_constant_address_space(
                     a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp

@@ -92,19 +92,17 @@ int main(int argc, char* argv[])
     const bool do_log = std::stoi(argv[4]);
     const int nrepeat = std::stoi(argv[5]);
 
-    constexpr index_t activ_type = 0;
-
 #if 0
     constexpr auto N  = Number<1>{};
     constexpr auto Hi = Number<1080>{};
     constexpr auto Wi = Number<1920>{};
     constexpr auto Y  = Number<3>{};
     constexpr auto X  = Number<3>{};
     constexpr auto C0 = Number<2>{};
     constexpr auto C1 = Number<8>{};
     constexpr auto K1 = Number<8>{};
     constexpr auto K0 = Number<8>{};
-#elif 1
+#elif 0
     constexpr auto N  = Number<1>{};
     constexpr auto Hi = Number<540>{};
     constexpr auto Wi = Number<960>{};
@@ -114,7 +112,7 @@ int main(int argc, char* argv[])
     constexpr auto C1 = Number<8>{};
     constexpr auto K1 = Number<8>{};
     constexpr auto K0 = Number<8>{};
-#elif 0
+#elif 1
     constexpr auto N  = Number<1>{};
     constexpr auto Hi = Number<270>{};
     constexpr auto Wi = Number<480>{};
@@ -166,7 +164,8 @@ int main(int argc, char* argv[])
     using out_data_t = int8_t;
 #endif
 
-    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5);
+    std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
+        bias_lengths_host(2);
 
     in_lengths_host[0] = static_cast<std::size_t>(N);
     in_lengths_host[1] = static_cast<std::size_t>(C0);
@@ -186,13 +185,18 @@ int main(int argc, char* argv[])
     out_lengths_host[3] = static_cast<std::size_t>(Wo);
     out_lengths_host[4] = static_cast<std::size_t>(K1);
 
+    bias_lengths_host[0] = static_cast<std::size_t>(K0);
+    bias_lengths_host[1] = static_cast<std::size_t>(K1);
+
     Tensor<in_data_t> in(in_lengths_host);
     Tensor<in_data_t> wei(wei_lengths_host);
+    Tensor<out_data_t> bias(bias_lengths_host);
     Tensor<out_data_t> out_host(out_lengths_host);
     Tensor<out_data_t> out_device(out_lengths_host);
 
     ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
     ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
+    ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
     ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
 
     print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
@@ -210,22 +214,27 @@ int main(int argc, char* argv[])
     case 1:
         in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         break;
     case 2:
         in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         break;
     case 3:
         in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         break;
     case 4:
         in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         break;
     case 5:
         in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
         wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
         break;
     default:
         in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
@@ -254,6 +263,8 @@ int main(int argc, char* argv[])
                                           in_right_pads_dev);
     };
 
+    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::None;
+
 #if USE_CONV_FWD_V5R1_NCHWC
     if(algo == ConvForwardAlgo::V5R1NCHWC)
     {
@@ -272,6 +283,7 @@ int main(int argc, char* argv[])
                 tmp[I6],
                 in,
                 wei,
+                bias,
                 out_device,
                 nrepeat);
     }
@@ -281,6 +293,7 @@ int main(int argc, char* argv[])
     {
         host_direct_convolution_nchwc(in,
                                       wei,
+                                      bias,
                                       out_host,
                                       make_tuple(conv_stride_h, conv_stride_w),
                                       make_tuple(conv_dilation_h, conv_dilation_w),
@@ -294,6 +307,7 @@ int main(int argc, char* argv[])
         {
             LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
             LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
             LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
             LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
         }
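The driver sizes the new bias tensor as K0 x K1, matching the split output-channel layout N, K0, Ho, Wo, K1 used for the output. A conventional per-channel bias of length K = K0 * K1 therefore maps onto it with k = k0 * K1 + k1, the same index the host reference computes. A small illustrative accessor over that layout (a hypothetical helper, not part of the repository):

    #include <vector>

    // Packed row-major view over a bias tensor with lengths {K0, K1}:
    // bias(k0, k1) is the flat per-channel value at k = k0 * K1 + k1.
    struct BiasK0K1View
    {
        const std::vector<float>& data; // size K0 * K1
        int K1;

        float operator()(int k0, int k1) const { return data[k0 * K1 + k1]; }
    };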
host/host_tensor/include/host_conv.hpp

@@ -110,6 +110,7 @@ template <typename TIn,
           typename InRightPads>
 void host_direct_convolution_nchwc(const Tensor<TIn>& in,
                                    const Tensor<TWei>& wei,
+                                   const Tensor<TOut>& bias,
                                    Tensor<TOut>& out,
                                    const ConvStrides& conv_strides,
                                    const ConvDilations& conv_dilations,
@@ -123,7 +124,8 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
     constexpr auto I1 = Number<1>{};
 
     auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
         double v = 0;
+        const int k = k0 * out.mDesc.GetLengths()[4] + k1;
         for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
         {
             for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
@@ -138,14 +140,13 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
                        wi < in.mDesc.GetLengths()[3])
                     {
                         v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
-                             static_cast<const double>(
-                                 wei(k0 * out.mDesc.GetLengths()[4] + k1, c0, y, x, c1));
+                             static_cast<const double>(wei(k, c0, y, x, c1));
                     }
                 }
             }
         }
 
-        out(n, k0, ho, wo, k1) = activ(v, activ_type);
+        out(n, k0, ho, wo, k1) = activ(v, activ_type) + bias(k0, k1);
     };
 
     make_ParallelTensorFunctor(f_nchw,
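Restated outside the repository's Tensor machinery, the changed host reference computes, for every output element (n, k0, ho, wo, k1), the activation of the accumulated convolution value first and then adds the per-channel bias(k0, k1). A self-contained sketch of that epilogue over flattened buffers follows; the names and the std::function-based activation hook are illustrative, not the repository's API.

    #include <cstddef>
    #include <functional>
    #include <vector>

    // 'out' initially holds the raw convolution accumulators, flattened row-major as
    // N x K0 x Ho x Wo x K1; 'bias' is flattened row-major as K0 x K1. 'activ' stands
    // for the repository's host-side helper selected by activ_type.
    void apply_epilogue(std::vector<double>& out,
                        const std::vector<double>& bias,
                        int N, int K0, int Ho, int Wo, int K1,
                        const std::function<double(double)>& activ)
    {
        for(int n = 0; n < N; ++n)
            for(int k0 = 0; k0 < K0; ++k0)
                for(int ho = 0; ho < Ho; ++ho)
                    for(int wo = 0; wo < Wo; ++wo)
                        for(int k1 = 0; k1 < K1; ++k1)
                        {
                            const std::size_t o =
                                (((static_cast<std::size_t>(n) * K0 + k0) * Ho + ho) * Wo + wo) * K1 + k1;
                            // out(n, k0, ho, wo, k1) = activ(v, activ_type) + bias(k0, k1)
                            out[o] = activ(out[o]) + bias[static_cast<std::size_t>(k0) * K1 + k1];
                        }
    }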