gaoqiong / composable_kernel / Commits

Commit 3e298e42, authored Oct 13, 2021 by Jing Zhang

add bias

Parent: 1e6d6782

Showing 7 changed files with 116 additions and 21 deletions (+116 -21)
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp   +84 -5
composable_kernel/include/utility/amd_buffer_addressing.hpp   +8 -0
host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp   +5 -6
host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp   +6 -2
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp   +10 -6
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp   +1 -1
host/host_tensor/include/host_conv.hpp   +2 -1
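
Note (reviewer summary, not part of the commit): taken together, the changes thread a per-output-channel bias tensor, bias_k0_k1 with shape (K0, K1), from the offline driver through the device and driver wrappers into GridwiseGemmDlops_km_kn_mn_v3_add, where it is added to every output element after activation and before the existing Resize_Add epilogue. A minimal standalone sketch of that per-element epilogue, with made-up values and a ReLU standing in for activ(v, activ_type):

    // Toy illustration of the fused epilogue this commit adds. All names and
    // values here are illustrative; only the formula mirrors the diff.
    #include <algorithm>
    #include <cstdio>

    float fused_output(float conv_acc, float bias_k)
    {
        const float activated = std::max(conv_acc, 0.0f); // stand-in for activ(v, activ_type)
        return activated + bias_k;                        // v = activ(v, activ_type) + bias(k0, k1)
    }

    int main()
    {
        std::printf("%f\n", fused_output(-1.5f, 0.25f)); // prints 0.250000
        std::printf("%f\n", fused_output(2.0f, 0.25f));  // prints 2.250000
        return 0;
    }

The host reference in host/host_tensor/include/host_conv.hpp applies exactly this formula, so the device path can be checked against it element by element.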
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp (view file @ 3e298e42)

@@ -27,6 +27,7 @@ __global__ void
 kernel_gemm_dlops_v2_add(const FloatAB* __restrict__ p_a_grid,
                          const FloatAB* __restrict__ p_b_grid,
+                         const FloatC* __restrict__ p_bias_grid,
                          FloatC* __restrict__ p_d_grid,
                          const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
                          const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,

@@ -40,6 +41,7 @@ __global__ void
     GridwiseGemm::Run(p_a_grid,
                       p_b_grid,
+                      p_bias_grid,
                       p_d_grid,
                       p_shared_block,
                       a_e0_e1_k0_k1_e2_grid_desc,

@@ -66,6 +68,7 @@ __global__ void
 #endif
 kernel_gemm_dlops_v2_add(const FloatAB* __restrict__ p_a_grid,
                          const FloatAB* __restrict__ p_b_grid,
+                         const FloatC* __restrict__ p_bias_grid,
                          FloatC* __restrict__ p_d_grid,
                          const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
                          const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,

@@ -95,6 +98,7 @@ __global__ void
     GridwiseGemm::Run(p_a_grid,
                       p_b_grid,
+                      p_bias_grid,
                       p_d_grid,
                       p_shared_block,
                       a_e0_e1_k0_k1_e2_grid_desc,

@@ -290,13 +294,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
         const auto K1 = Number<KPerBlock>{};
         const auto K0 = K / K1;

-        const auto H2 = Number<HoPerThread * 2>{};
+        const auto H2 = HoPerThread * 2;
         const auto H1 = Number<HoPerBlock / HoPerThread>{};
-        const auto H0 = Number<Hox2 / (H1 * H2)>{};
+        const auto H0 = Hox2 / (H1 * H2);

-        const auto W2 = Number<WoPerThread * 2>{};
+        const auto W2 = WoPerThread * 2;
         const auto W1 = Number<WoPerBlock / WoPerThread>{};
-        const auto W0 = Number<Wox2 / (W1 * W2)>{};
+        const auto W0 = Wox2 / (W1 * W2);

         const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc = transform_tensor_descriptor(
             d_k_n_hox2_wox2_grid_desc,

@@ -340,10 +344,20 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
     using CBlockIdToBlockClusterAdaptor_K_N_H_W =
         decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));

+    __host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor(
+        const DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2& d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc)
+    {
+        const auto K0 = d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetLength(I0);
+        const auto K1 = d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetLength(I1);
+
+        return make_naive_tensor_descriptor_packed(make_tuple(K0, K1));
+    }
+
     template <bool HasMainE0BlockLoop>
     __device__ static void Run(const FloatAB* __restrict__ p_a_global,
                                const FloatAB* __restrict__ p_b_global,
+                               const FloatC* __restrict__ p_bias_global,
                                FloatC* __restrict__ p_d_global,
                                FloatAB* __restrict__ p_shared_block,
                                const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,

@@ -352,12 +366,26 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
         const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
         integral_constant<bool, HasMainE0BlockLoop>)
     {
+        // constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
+        // constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
+        // BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
+        // constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc =
+        // DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2{};
+        // constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
+        // CBlockIdToBlockClusterAdaptor_K_N_H_W{};
+
+        const auto bias_k0_k1_grid_desc =
+            MakeBiasK0K1GridDescriptor(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);
+
         const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
         const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
         auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_d_global, d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.GetElementSpaceSize());
+        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());

         constexpr auto HasMainE1BlockLoop       = CalculateHasMainE1BlockLoop();
         constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();

@@ -747,6 +775,57 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
             });
         }

+        // bias
+        {
+            constexpr auto bias_k0_k1_thread_desc =
+                make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
+
+            StaticBuffer<AddressSpaceEnum_t::Vgpr,
+                         FloatC,
+                         bias_k0_k1_thread_desc.GetElementSpaceSize(),
+                         true>
+                bias_thread_buf;
+
+            const index_t k_thread_data_on_global = k_thread_id * KPerThread;
+
+            auto bias_threadwise_transfer =
+                ThreadwiseTensorSliceTransfer_v2<FloatC,
+                                                 FloatC,
+                                                 decltype(bias_k0_k1_grid_desc),
+                                                 decltype(bias_k0_k1_thread_desc),
+                                                 Sequence<I1, Number<KPerThread>{}>,
+                                                 Sequence<0, 1>,
+                                                 1,
+                                                 CThreadTransferDstScalarPerVector,
+                                                 false,
+                                                 true>(
+                    bias_k0_k1_grid_desc,
+                    make_multi_index(k_block_work_id, k_thread_data_on_global));
+
+            constexpr auto bias_k0_k1_global_tensor_step_hacks =
+                make_tuple(make_tuple(Sequence<0>{}, Sequence<0>{}),
+                           make_tuple(Sequence<0>{}, Sequence<0>{}));
+
+            bias_threadwise_transfer.Run(bias_k0_k1_grid_desc,
+                                         bias_global_buf,
+                                         bias_k0_k1_thread_desc,
+                                         make_tuple(I0, I0),
+                                         bias_thread_buf,
+                                         bias_k0_k1_global_tensor_step_hacks);
+
+#if 1
+            static_for<0, KPerThread, 1>{}([&](auto ki) {
+                static_for<0, HoPerThread, 1>{}([&](auto hi) {
+                    static_for<0, WoPerThread, 1>{}([&](auto wi) {
+                        constexpr index_t c_offset =
+                            c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
+                                make_tuple(ki, 0, hi, wi));
+
+                        c_thread_buf(Number<c_offset>{}) =
+                            c_thread_buf[Number<c_offset>{}] + bias_thread_buf[ki];
+                    });
+                });
+            });
+#endif
+        }
+
         // Resize_Add
         {
             constexpr auto HoPerThreadx2 = HoPerThread * 2;

@@ -843,7 +922,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
                 CThreadTransferSrcDstAccessOrder,
                 CThreadTransferSrcDstVectorDim,
                 CThreadTransferDstScalarPerVector,
-                InMemoryDataOperationEnum_t::Add, // CGlobalMemoryDataOperation,
+                CGlobalMemoryDataOperation,
                 1,
                 true>(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
                       make_multi_index(k_block_work_id,
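
A rough reading of the new bias block above (reviewer paraphrase): each thread copies its KPerThread-wide slice of the bias, starting at k_thread_id * KPerThread, from global memory into registers with ThreadwiseTensorSliceTransfer_v2, then adds bias_thread_buf[ki] to every (ki, hi, wi) accumulator it owns before the Resize_Add stage runs. A scalar CPU sketch of that broadcast, with toy sizes that are not the kernel's real tuning parameters:

    // Standalone sketch of the per-thread bias broadcast; the sizes and the
    // thread id are invented for illustration.
    #include <cstdio>
    #include <vector>

    int main()
    {
        constexpr int KPerThread = 4, HoPerThread = 2, WoPerThread = 2;
        const int k_thread_id = 3;              // pretend thread coordinate
        std::vector<float> bias_grid(64, 0.5f); // bias_k0_k1, flattened

        // step 1: the ThreadwiseTensorSliceTransfer_v2 equivalent, a slice copy
        float bias_thread_buf[KPerThread];
        const int k_thread_data_on_global = k_thread_id * KPerThread;
        for(int ki = 0; ki < KPerThread; ++ki)
            bias_thread_buf[ki] = bias_grid[k_thread_data_on_global + ki];

        // step 2: broadcast-add over the thread's accumulator tile, mirroring
        // the static_for triple loop in the diff
        float c_thread_buf[KPerThread][HoPerThread][WoPerThread] = {};
        for(int ki = 0; ki < KPerThread; ++ki)
            for(int hi = 0; hi < HoPerThread; ++hi)
                for(int wi = 0; wi < WoPerThread; ++wi)
                    c_thread_buf[ki][hi][wi] += bias_thread_buf[ki];

        std::printf("c_thread_buf[0][0][0] = %f\n", c_thread_buf[0][0][0]);
        return 0;
    }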
composable_kernel/include/utility/amd_buffer_addressing.hpp (view file @ 3e298e42)

@@ -537,6 +537,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
     }
     else if constexpr(N == 8)
     {
+#if 0
         vector_type<half_t, 8> tmp{src_thread_data};

         llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],

@@ -550,6 +551,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset + 4 * sizeof(half_t),
                                             0);
+#else
+        llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
+                                            dst_wave_buffer_resource,
+                                            dst_thread_addr_offset,
+                                            dst_wave_addr_offset,
+                                            0);
+#endif
     }
 }
 else if constexpr(is_same<T, int32_t>::value)
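
The hunk above parks the two half4 buffer stores under #if 0 and instead issues a single fp32x4 store of the same 16 bytes, reinterpreted through as_type<float4_t>. The reinterpretation is a bit-level view change, not a numeric conversion; a portable C++ sketch, with uint16_t standing in for half_t:

    // Illustration only: eight 16-bit lanes viewed as four 32-bit lanes so one
    // 16-byte store can replace two 8-byte stores. No HIP intrinsics are used.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        std::uint16_t half8[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // pretend fp16 payload
        float float4[4];

        static_assert(sizeof(half8) == sizeof(float4), "same 16-byte footprint");
        std::memcpy(float4, half8, sizeof(float4)); // reinterpret bits, do not convert

        std::printf("%a %a %a %a\n", float4[0], float4[1], float4[2], float4[3]);
        return 0;
    }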
host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp (view file @ 3e298e42)

@@ -26,9 +26,9 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
     const InRightPads& in_right_pads,
     const Tensor<TInWei>& in_n_c0_hi_wi_c1,
     const Tensor<TInWei>& wei_k_c0_y_x_c1,
+    const Tensor<TOut>& bias_k0_k1,
     const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
     Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
-    Tensor<TOut>& out_n_k0_ho_wo_k1,
     ck::index_t nrepeat)
 {
     using namespace ck;

@@ -62,13 +62,13 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
     DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                           in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
     DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
+    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
     DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                                add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
-    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * out_n_k0_ho_wo_k1.mDesc.GetElementSpace());

     in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
     wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
+    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
     add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());

     constexpr index_t InWeiVectorSize = 8;

@@ -187,8 +187,8 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
         wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
     static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
         in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+    static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
     static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
-    static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
     nrepeat);

 {

@@ -214,10 +214,9 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
         wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
     static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
         in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
+    static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
     static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
-    static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
     0);

     add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
-    out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
 }
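
The wrapper change follows the pattern already used for the other tensors: size a device buffer from the element count, copy the host bias over, and pass a typed pointer down to the driver. Sketched against the raw HIP runtime (an assumption made purely for illustration; the repository uses its own DeviceMem helper, and error handling is omitted):

    #include <hip/hip_runtime.h>
    #include <vector>

    int main()
    {
        // host bias; (8, 8) is only an example shape for (K0, K1)
        std::vector<float> bias_k0_k1(8 * 8, 1.0f);

        // roughly: DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
        float* bias_dev = nullptr;
        hipMalloc(reinterpret_cast<void**>(&bias_dev), sizeof(float) * bias_k0_k1.size());

        // roughly: bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
        hipMemcpy(bias_dev, bias_k0_k1.data(), sizeof(float) * bias_k0_k1.size(),
                  hipMemcpyHostToDevice);

        // static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()) is what then
        // gets passed to the driver alongside the input, weight and add pointers.

        hipFree(bias_dev);
        return 0;
    }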
host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp (view file @ 3e298e42)

@@ -48,8 +48,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
         const InRightPads& in_right_pads,
         const FloatAB* __restrict__ p_a_grid,
         const FloatAB* __restrict__ p_b_grid,
+        const FloatC* __restrict__ p_bias_grid,
         FloatC* __restrict__ p_d_grid,
-        FloatC* __restrict__ p_c_grid,
         const int nrepeat) const
     {
         using namespace ck;

@@ -279,7 +279,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
         FloatAB,
         FloatAcc,
         FloatC,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum_t::Add,
         decltype(a_e0_e1_k_e2_grid_desc),
         decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
         decltype(d_k_n_hopx2_wopx2_grid_desc),

@@ -366,6 +366,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
             0,
             p_a_grid,
             p_b_grid,
+            p_bias_grid,
             p_d_grid,
             a_e0_e1_k0_k1_e2_grid_desc,
             b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,

@@ -391,6 +392,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
             0,
             p_a_grid,
             p_b_grid,
+            p_bias_grid,
             p_d_grid,
             a_e0_e1_k0_k1_e2_grid_desc,
             b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,

@@ -436,6 +438,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
             0,
             p_a_grid,
             p_b_grid,
+            p_bias_grid,
             p_d_grid,
             cast_pointer_to_constant_address_space(
                 a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),

@@ -467,6 +470,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
             0,
             p_a_grid,
             p_b_grid,
+            p_bias_grid,
             p_d_grid,
             cast_pointer_to_constant_address_space(
                 a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp (view file @ 3e298e42)

@@ -105,7 +105,7 @@ int main(int argc, char* argv[])
     constexpr auto C1 = Number<8>{};
     constexpr auto K1 = Number<8>{};
     constexpr auto K0 = Number<8>{};
-#elif 0
+#elif 1
     constexpr auto N  = Number<1>{};
     constexpr auto Hi = Number<540>{};
     constexpr auto Wi = Number<960>{};

@@ -182,7 +182,7 @@ int main(int argc, char* argv[])
 #endif

     std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
-        add_lengths_host(5);
+        add_lengths_host(5), bias_lengths_host(2);

     in_lengths_host[0] = static_cast<std::size_t>(N);
     in_lengths_host[1] = static_cast<std::size_t>(C0);

@@ -208,18 +208,21 @@ int main(int argc, char* argv[])
     add_lengths_host[3] = static_cast<std::size_t>(Wox2);
     add_lengths_host[4] = static_cast<std::size_t>(K1);

+    bias_lengths_host[0] = static_cast<std::size_t>(K0);
+    bias_lengths_host[1] = static_cast<std::size_t>(K1);
+
     Tensor<in_data_t> in(in_lengths_host);
     Tensor<in_data_t> wei(wei_lengths_host);
     Tensor<in_data_t> add(add_lengths_host);
-    Tensor<out_data_t> out_host(out_lengths_host);
     Tensor<out_data_t> out_device(out_lengths_host);
     Tensor<in_data_t> add_device(add_lengths_host);
     Tensor<in_data_t> add_host(add_lengths_host);
+    Tensor<out_data_t> bias(bias_lengths_host);
+    Tensor<out_data_t> out_host(out_lengths_host);

     ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
     ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
     ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
     ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
     print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
     print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));

@@ -262,6 +265,7 @@ int main(int argc, char* argv[])
         wei.GenerateTensorValue(gen_wei, num_thread);
     }

+    bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);

     auto f_make_for_device_nchwc = [&]() {

@@ -303,9 +307,9 @@ int main(int argc, char* argv[])
             tmp[I7], // in_right_pads_dev
             in,
             wei,
+            bias,
             add,
             add_device,
-            out_device,
             nrepeat);
     }
 #endif

@@ -315,6 +319,7 @@ int main(int argc, char* argv[])
         host_direct_convolution_add_nchwc(in,
                                           wei,
                                           add,
+                                          bias,
                                           add_host,
                                           out_host,
                                           make_tuple(conv_stride_h, conv_stride_w),

@@ -329,7 +334,6 @@ int main(int argc, char* argv[])
     {
         // LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
         // LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-        // LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
         // LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
         // std::endl;

         LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp (view file @ 3e298e42)

@@ -102,7 +102,7 @@ int main(int argc, char* argv[])
     constexpr auto C1 = Number<8>{};
     constexpr auto K1 = Number<8>{};
     constexpr auto K0 = Number<8>{};
-#elif 0
+#elif 1
     constexpr auto N  = Number<1>{};
     constexpr auto Hi = Number<540>{};
     constexpr auto Wi = Number<960>{};
host/host_tensor/include/host_conv.hpp (view file @ 3e298e42)

@@ -167,6 +167,7 @@ template <typename TIn,
 void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                                        const Tensor<TWei>& wei,
                                        const Tensor<TOut>& add,
+                                       const Tensor<TOut>& bias,
                                        Tensor<TOut>& add_host,
                                        Tensor<TOut>& out_host,
                                        const ConvStrides& conv_strides,

@@ -204,7 +205,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                 }
             }

-            v = activ(v, activ_type);
+            v = activ(v, activ_type) + bias(k0, k1);

             const int hox2 = ho * 2;
             const int wox2 = wo * 2;