gaoqiong / composable_kernel / Commits

Commit e8f639d2, authored Jul 02, 2022 by carlushuang
Parent: 3e2a530f

fix a bug when buffer is larger than 4G

Showing 11 changed files with 210 additions and 220 deletions.
example/cpu_01_conv2d_fwd/cpu_conv2d_fwd.cpp                                                            +15 −19
example/cpu_02_conv2d_fwd_bias_relu_add/cpu_conv2d_fwd_bias_relu_add.cpp                                +15 −19
include/ck/tensor_operation/cpu/device/device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp               +2 −2
include/ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp                        +2 −2
include/ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_kyxck8_nhwk.hpp                      +2 −2
include/ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_yxck_nhwk.hpp                        +2 −2
include/ck/tensor_operation/cpu/device/device_convnd_fwd_bias_activation_add_avx2_nhwc_kyxc_nhwk.hpp    +2 −2
include/ck/tensor_operation/cpu/device/device_convnd_fwd_bias_activation_add_avx2_nhwc_kyxck8_nhwk.hpp  +2 −2
include/ck/tensor_operation/cpu/device/device_convnd_fwd_bias_activation_add_avx2_nhwc_yxck_nhwk.hpp    +2 −2
include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp         +161 −159
library/src/host_tensor/device.cpp                                                                      +5 −9
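All eleven files chase the same root cause: sizes, loop counters, and offsets held in 32-bit types (`int`, `ck::index_t`) wrap once a buffer crosses the 4 GB / 2^31-element boundary. A minimal sketch of the failure mode (illustrative only, not code from the repository):

```cpp
// Illustrative sketch: how a 32-bit offset silently wraps on a >4G buffer.
#include <cstdint>
#include <cstdio>

int main()
{
    const std::int64_t n      = 70000; // hypothetical row count
    const std::int64_t stride = 40000; // hypothetical elements per row
    std::int64_t full = n * stride;                       // 2'800'000'000, fits in 64 bits
    std::int32_t wrap = static_cast<std::int32_t>(full);  // exceeds 2^31 -> wraps negative
    std::printf("64-bit offset: %lld\n32-bit offset: %d\n", (long long)full, wrap);
}
```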
example/cpu_01_conv2d_fwd/cpu_conv2d_fwd.cpp

@@ -116,19 +116,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Relu;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-    int error_count          = 0;
+    std::size_t error_count  = 0;
     float max_diff           = 1e-5;
     double square_difference = .0;
     double mag1              = .0;
     double mag2              = .0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < len; ++i)
     {
-        double ri = (double)ref.mData[i];
-        double pi = (double)result.mData[i];
+        double ri = (double)ref[i];
+        double pi = (double)result[i];
         double d  = ri - pi;
         if(per_pixel_check)
@@ -136,11 +136,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
             if(max_diff < std::abs(d))
             {
                 error_count++;
-                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref.mData[i]), double(result.mData[i]), d);
+                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
             }
         }
@@ -152,7 +149,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
     }
     double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
     if(computed_nrms >= nrms)
         printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -360,7 +357,6 @@ int main(int argc, char* argv[])
     f_host_tensor_descriptor(K, C, Y, X));
     // TODO: This is only to hold data
 #endif
     Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     std::cout << "in (N, C, Hi, Wi): " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei(K, C, Y, X): " << wei_k_c_y_x.mDesc << std::endl;
@@ -651,10 +647,10 @@ int main(int argc, char* argv[])
     double gflops = (total_flop * 1e-6) / time;
     out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-    if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-                                    out_n_k_ho_wo_device_result,
+    if(cpu_validation && !check_out(out_n_k_ho_wo_host_result.mData.data(),
+                                    reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+                                    out_n_k_ho_wo_host_result.mData.size(),
                                     1e-6,
                                     per_pixel_check))
     {
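With a >4G output no longer practical to duplicate into a second `Tensor`, `check_out` now takes raw pointers plus an explicit `std::size_t` length, and the example validates the host reference directly against the device buffer. A hedged re-statement of the pointer-based check in isolation (the `mag1`/`mag2` accumulation is collapsed out of the diff, so the max-magnitude choice below is an assumption):

```cpp
// Hedged sketch, not the exact project code. Assumes mag1/mag2 track peak
// magnitudes; that part of the function is not visible in the collapsed diff.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>

template <typename T>
static bool check_out(const T* ref, const T* result, std::size_t len, double nrms)
{
    double square_difference = 0.0, mag1 = 0.0, mag2 = 0.0;
    for(std::size_t i = 0; i < len; ++i) // std::size_t survives len > 2^31
    {
        double ri = static_cast<double>(ref[i]);
        double pi = static_cast<double>(result[i]);
        square_difference += (ri - pi) * (ri - pi);
        mag1 = std::max(mag1, std::fabs(ri));
        mag2 = std::max(mag2, std::fabs(pi));
    }
    double mag = std::max({mag1, mag2, std::numeric_limits<double>::min()});
    return std::sqrt(square_difference) / (std::sqrt(static_cast<double>(len)) * mag) < nrms;
}
```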
example/cpu_02_conv2d_fwd_bias_relu_add/cpu_conv2d_fwd_bias_relu_add.cpp

@@ -152,19 +152,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Add;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-    int error_count          = 0;
+    std::size_t error_count  = 0;
     float max_diff           = 1e-5;
     double square_difference = .0;
     double mag1              = .0;
     double mag2              = .0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < len; ++i)
     {
-        double ri = (double)ref.mData[i];
-        double pi = (double)result.mData[i];
+        double ri = (double)ref[i];
+        double pi = (double)result[i];
         double d  = ri - pi;
         if(per_pixel_check)
@@ -172,11 +172,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
             if(max_diff < std::abs(d))
             {
                 error_count++;
-                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref.mData[i]), double(result.mData[i]), d);
+                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
             }
         }
@@ -188,7 +185,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
     }
     double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
     if(computed_nrms >= nrms)
         printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -407,7 +404,6 @@ int main(int argc, char* argv[])
     f_host_tensor_descriptor(K, C, Y, X));
     // TODO: This is only to hold data
 #endif
     Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     // bias: assume contiguous 1d vector
     Tensor<OutDataType> bias(
@@ -788,10 +784,10 @@ int main(int argc, char* argv[])
     double gflops = (total_flop * 1e-6) / time;
     out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-    if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-                                    out_n_k_ho_wo_device_result,
+    if(cpu_validation && !check_out(out_n_k_ho_wo_host_result.mData.data(),
+                                    reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+                                    out_n_k_ho_wo_host_result.mData.size(),
                                     1e-6,
                                     per_pixel_check))
     {
include/ck/tensor_operation/cpu/device/device_convnd_direct_fwd_avx2_nhwc_kyxck8_nhwk.hpp

@@ -693,7 +693,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_direct_conv_nhwc_avx_mxn<GridwiseGemm,
                                                                      InDataType,
@@ -734,7 +734,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
include/ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp

@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -743,7 +743,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
include/ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_kyxck8_nhwk.hpp

@@ -688,7 +688,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -719,7 +719,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
include/ck/tensor_operation/cpu/device/device_convnd_fwd_avx2_nhwc_yxck_nhwk.hpp

@@ -681,7 +681,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
include/ck/tensor_operation/cpu/device/device_convnd_fwd_bias_activation_add_avx2_nhwc_kyxc_nhwk.hpp

@@ -785,7 +785,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -825,7 +825,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
include/ck/tensor_operation/cpu/device/device_convnd_fwd_bias_activation_add_avx2_nhwc_kyxck8_nhwk.hpp

@@ -762,7 +762,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -802,7 +802,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
include/ck/tensor_operation/cpu/device/device_convnd_fwd_bias_activation_add_avx2_nhwc_yxck_nhwk.hpp

@@ -758,7 +758,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -798,7 +798,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
        // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
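All seven device headers above make the same two-line change: the benchmark-only `memset` of the output grid is commented out. Worth noting if it is ever re-enabled: `memset` takes a byte count, so the element count from `GetElementSpaceSize()` needs a 64-bit multiply by the element size, roughly as below (a hedged sketch; `clear_grid` is a hypothetical helper, not project code):

```cpp
// Hedged sketch: clearing an output grid without wrapping the size on
// buffers past 4 GB. clear_grid is hypothetical, not from the repository.
#include <cstddef>
#include <cstring>

template <typename T>
void clear_grid(T* p_c_grid, long long element_space_size)
{
    // memset counts bytes; do the element->byte conversion in 64 bits.
    std::memset(p_c_grid, 0, static_cast<std::size_t>(element_space_size) * sizeof(T));
}
```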
include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp

@@ -18,10 +18,13 @@ namespace cpu {
 namespace avx2_util {

 template <typename ElementwiseOp>
-void memcpy32_avx2(void* dst, const void* src, const ck::index_t n, const ElementwiseOp& element_op)
+void memcpy32_avx2(void* dst, const void* src, const ck::long_index_t n, const ElementwiseOp& element_op)
 {
     // 16-8-4-2-1 pattern
-    ck::index_t i_n = n;
+    ck::long_index_t i_n = n;
     float* p_dst       = reinterpret_cast<float*>(dst);
     const float* p_src = reinterpret_cast<const float*>(src);

     while(i_n >= 16)
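The bodies of the `memcpy32_avx2` family are collapsed in this view; only the widening of `n`/`i_n` to `ck::long_index_t` is shown. For orientation, a hedged reconstruction of the "16-8-4-2-1 pattern" the comment names, with the 64-bit counter (illustrative only; the real version also applies `element_op`):

```cpp
// Illustrative sketch of a 16-8-4-2-1 float copy with a 64-bit counter;
// not the project's exact code.
#include <immintrin.h>
#include <cstdint>

inline void copy32_avx2_sketch(float* p_dst, const float* p_src, std::int64_t i_n)
{
    while(i_n >= 16) // two 8-float ymm loads/stores per iteration
    {
        _mm256_storeu_ps(p_dst + 0, _mm256_loadu_ps(p_src + 0));
        _mm256_storeu_ps(p_dst + 8, _mm256_loadu_ps(p_src + 8));
        p_dst += 16; p_src += 16; i_n -= 16;
    }
    if(i_n >= 8) { _mm256_storeu_ps(p_dst, _mm256_loadu_ps(p_src)); p_dst += 8; p_src += 8; i_n -= 8; }
    if(i_n >= 4) { _mm_storeu_ps(p_dst, _mm_loadu_ps(p_src)); p_dst += 4; p_src += 4; i_n -= 4; }
    while(i_n > 0) { *p_dst++ = *p_src++; --i_n; } // 2-1 tail, scalar
}
```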
@@ -67,11 +70,11 @@ void memcpy32_avx2_with_extra_2src(void* dst,
                                    const void* src,
                                    const void* src1,
                                    const void* src2,
-                                   const ck::index_t n,
+                                   const ck::long_index_t n,
                                    const ElementwiseOp& element_op)
 {
     // 16-8-4-2-1 pattern
-    ck::index_t i_n = n;
+    ck::long_index_t i_n = n;
     float* p_dst        = reinterpret_cast<float*>(dst);
     const float* p_src  = reinterpret_cast<const float*>(src);
     const float* p_src1 = reinterpret_cast<const float*>(src1);
@@ -146,11 +149,11 @@ void memcpy32_avx2_with_extra_2src(void* dst,
                                    const void* src,
                                    float v_src1,
                                    const void* src2,
-                                   const ck::index_t n,
+                                   const ck::long_index_t n,
                                    const ElementwiseOp& element_op)
 {
     // 16-8-4-2-1 pattern
-    ck::index_t i_n = n;
+    ck::long_index_t i_n = n;
     float* p_dst        = reinterpret_cast<float*>(dst);
     const float* p_src  = reinterpret_cast<const float*>(src);
     const float* p_src2 = reinterpret_cast<const float*>(src2);
@@ -214,11 +217,11 @@ template <typename ElementwiseOp>
 void memcpy32_avx2_with_extra_1src(void* dst,
                                    const void* src,
                                    const void* src_aux,
-                                   const ck::index_t n,
+                                   const ck::long_index_t n,
                                    const ElementwiseOp& element_op)
 {
     // 16-8-4-2-1 pattern
-    ck::index_t i_n = n;
+    ck::long_index_t i_n = n;
     float* p_dst           = reinterpret_cast<float*>(dst);
     const float* p_src     = reinterpret_cast<const float*>(src);
     const float* p_src_aux = reinterpret_cast<const float*>(src_aux);
@@ -277,11 +280,11 @@ template <typename ElementwiseOp>
 void memcpy32_avx2_with_extra_1src(void* dst,
                                    const void* src,
                                    const float v_src_aux,
-                                   const ck::index_t n,
+                                   const ck::long_index_t n,
                                    const ElementwiseOp& element_op)
 {
     // 16-8-4-2-1 pattern
-    ck::index_t i_n = n;
+    ck::long_index_t i_n = n;
     float* p_dst       = reinterpret_cast<float*>(dst);
     const float* p_src = reinterpret_cast<const float*>(src);
@@ -320,10 +323,10 @@ void memcpy32_avx2_with_extra_1src(void* dst,
     }
 }

-inline void memset32_avx2(void* dst, const int32_t value, const ck::index_t n)
+inline void memset32_avx2(void* dst, const int32_t value, const ck::long_index_t n)
 {
     // 16-8-4-2-1 pattern
-    ck::index_t i_n = n;
+    ck::long_index_t i_n = n;
     float* p_dst = reinterpret_cast<float*>(dst);
     __m256 ymm = _mm256_set1_ps(*reinterpret_cast<const float*>(&value));
     __m128 xmm = _mm_set1_ps(*reinterpret_cast<const float*>(&value));
@@ -361,9 +364,9 @@ inline void memset32_avx2(void* dst, const int32_t value, const ck::index_t n)
 template <typename ElementwiseOp>
 void transpose8x8_avx2(void* dst,
-                       ck::index_t stride_dst,
+                       ck::long_index_t stride_dst,
                        const void* src,
-                       ck::index_t stride_src,
+                       ck::long_index_t stride_src,
                        const ElementwiseOp& element_op)
 {
     // TODO: use vinsertf128 for better port usage. vpermf128 is slow
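As an aside, `memset32_avx2` broadcasts an arbitrary 32-bit pattern through float registers, which is why `value` is reinterpreted rather than converted. A hedged sketch of that broadcast idiom (illustrative only, not the project's full function; `std::memcpy` is used here to sidestep the strict-aliasing pitfall of the pointer cast):

```cpp
// Sketch of broadcasting a raw 32-bit pattern via AVX float registers,
// with a 64-bit counter. Illustrative, not project code.
#include <immintrin.h>
#include <cstdint>
#include <cstring>

inline void fill32_avx2_sketch(float* p_dst, std::int32_t value, std::int64_t i_n)
{
    float f;
    std::memcpy(&f, &value, sizeof(f)); // bit-preserving reinterpretation
    __m256 ymm = _mm256_set1_ps(f);
    for(; i_n >= 8; i_n -= 8, p_dst += 8)
        _mm256_storeu_ps(p_dst, ymm);
    for(; i_n > 0; --i_n)
        *p_dst++ = f; // scalar tail
}
```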
@@ -560,8 +563,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
     void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
     {
-        ck::index_t idx_m = src_slice_origin_idx[Number<0>{}];
-        ck::index_t idx_k = src_slice_origin_idx[Number<1>{}];
+        intptr_t idx_m = src_slice_origin_idx[Number<0>{}];
+        intptr_t idx_k = src_slice_origin_idx[Number<1>{}];

         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
@@ -640,19 +643,19 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
         }
         else
         {
-            const ck::index_t m_per_block = slice_length[Number<0>{}];
-            const ck::index_t k_per_block = slice_length[Number<1>{}];
+            const intptr_t m_per_block = slice_length[Number<0>{}];
+            const intptr_t k_per_block = slice_length[Number<1>{}];

             const float* p_src = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
             float* p_dst       = reinterpret_cast<float*>(dst_buf.p_data_);

-            // printf("src offset:%d, k_per_block:%d, m_per_block:%d\n", src_offset, k_per_block,
-            // m_per_block);
+            // printf("src offset:%llu, k_per_block:%d, m_per_block:%d\n", src_offset, k_per_block,
+            // m_per_block); fflush(stdout);

             if constexpr(ConvForwardSpecialization ==
                          ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
             {
-                ck::index_t i_m_itr = m_per_block;
+                intptr_t i_m_itr = m_per_block;
                 // standard 8-4-2-1 pattern
                 while(i_m_itr >= 8)
                 {
@@ -712,9 +715,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
             else if constexpr(ConvForwardSpecialization ==
                               ConvolutionForwardSpecialization_t::Filter1x1Pad0)
             {
-                ck::index_t i_m_itr  = m_per_block;
-                ck::index_t i_wo_itr = i_wo;
-                ck::index_t i_ho_itr = i_ho;
+                intptr_t i_m_itr  = m_per_block;
+                intptr_t i_wo_itr = i_wo;
+                intptr_t i_ho_itr = i_ho;

                 while(i_m_itr > 0)
                 {
                     avx2_util::memcpy32_avx2(p_dst, p_src, k_per_block, element_op_);
@@ -743,11 +746,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
             if(gemm_k_spec_ == ConvolutionForwardGemmKSpecialization_t::NHWC_GemmKLoopOverC)
             {
                 // c % k_per_block == 0, so every time k_per_block here is the same
-                ck::index_t i_m_itr  = m_per_block;
-                ck::index_t i_wo_itr = i_wo;
-                ck::index_t i_ho_itr = i_ho;
-                ck::index_t i_wi_itr = i_wi;
-                ck::index_t i_hi_itr = i_hi;
+                intptr_t i_m_itr  = m_per_block;
+                intptr_t i_wo_itr = i_wo;
+                intptr_t i_ho_itr = i_ho;
+                intptr_t i_wi_itr = i_wi;
+                intptr_t i_hi_itr = i_hi;

                 while(i_m_itr > 0)
                 {
@@ -785,11 +788,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
             }
             else
             {
-                ck::index_t i_m_itr  = m_per_block;
-                ck::index_t i_wo_itr = i_wo;
-                ck::index_t i_ho_itr = i_ho;
-                ck::index_t i_wi_itr = i_wi;
-                ck::index_t i_hi_itr = i_hi;
+                intptr_t i_m_itr  = m_per_block;
+                intptr_t i_wo_itr = i_wo;
+                intptr_t i_ho_itr = i_ho;
+                intptr_t i_wi_itr = i_wi;
+                intptr_t i_hi_itr = i_hi;

                 // ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h
                 // iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w
                 while(i_m_itr > 0)
@@ -797,16 +800,16 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
                     /*** go along Gemm K ***/
                     const float* p_src_k = p_src;
                     float* p_dst_k       = p_dst;
-                    ck::index_t i_wi_itr_k = i_wi_itr;
-                    ck::index_t i_hi_itr_k = i_hi_itr;
-                    ck::index_t i_c_itr_k  = i_c;
-                    // ck::index_t i_y_itr_k = i_y;
-                    ck::index_t i_x_itr_k = i_x;
+                    intptr_t i_wi_itr_k = i_wi_itr;
+                    intptr_t i_hi_itr_k = i_hi_itr;
+                    intptr_t i_c_itr_k  = i_c;
+                    // intptr_t i_y_itr_k = i_y;
+                    intptr_t i_x_itr_k = i_x;

-                    ck::index_t i_k_itr = k_per_block;
+                    intptr_t i_k_itr = k_per_block;

                     while(i_k_itr > 0)
                     {
-                        ck::index_t current_k_block_along_c =
+                        intptr_t current_k_block_along_c =
                             ck::math::min(C - i_c_itr_k, i_k_itr);
                         // printf("current_k_block_along_c:%d, i_c_itr_k:%d, k_per_block:%d\n",
@@ -875,7 +878,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
     void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
     {
-        ck::index_t move_k = src_slice_origin_step_idx[Number<1>{}];
+        intptr_t move_k = src_slice_origin_step_idx[Number<1>{}];

         if constexpr(ConvForwardSpecialization ==
                      ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
         {
@@ -937,35 +940,35 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
     const ElementwiseOperation element_op_;
     const ConvolutionForwardGemmKSpecialization_t gemm_k_spec_;

-    ck::index_t i_n;
-    ck::index_t i_c;
-    ck::index_t i_hi;
-    ck::index_t i_wi;
-    ck::index_t i_ho;
-    ck::index_t i_wo;
-    ck::index_t i_y;
-    ck::index_t i_x;
-    ck::index_t i_gemm_k;
+    intptr_t i_n;
+    intptr_t i_c;
+    intptr_t i_hi;
+    intptr_t i_wi;
+    intptr_t i_ho;
+    intptr_t i_wo;
+    intptr_t i_y;
+    intptr_t i_x;
+    intptr_t i_gemm_k;

-    ck::index_t N;
-    // ck::index_t K;
-    ck::index_t C;
-    ck::index_t Hi;
-    ck::index_t Wi;
-    ck::index_t Ho;
-    ck::index_t Wo;
+    intptr_t N;
+    // intptr_t K;
+    intptr_t C;
+    intptr_t Hi;
+    intptr_t Wi;
+    intptr_t Ho;
+    intptr_t Wo;

-    ck::index_t Sy;
-    ck::index_t Sx;
-    ck::index_t Dy;
-    ck::index_t Dx;
-    ck::index_t Py;
-    ck::index_t Px;
-    ck::index_t Fy;
-    ck::index_t Fx;
+    intptr_t Sy;
+    intptr_t Sx;
+    intptr_t Dy;
+    intptr_t Dx;
+    intptr_t Py;
+    intptr_t Px;
+    intptr_t Fy;
+    intptr_t Fx;

     intptr_t input_offset_acc_wi;
     intptr_t input_offset_ovf_wi_acc_hi;
@@ -1008,9 +1011,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
     void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
     {
-        ck::index_t idx_n0 = src_slice_origin_idx[Number<0>{}];
-        ck::index_t idx_k  = src_slice_origin_idx[Number<1>{}];
-        ck::index_t idx_n1 = src_slice_origin_idx[Number<2>{}];
+        intptr_t idx_n0 = src_slice_origin_idx[Number<0>{}];
+        intptr_t idx_k  = src_slice_origin_idx[Number<1>{}];
+        intptr_t idx_n1 = src_slice_origin_idx[Number<2>{}];

         i_gemm_n = idx_n0 * GemmN1 + idx_n1;
         // i_gemm_k = idx_k;
@@ -1037,8 +1040,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
         }
         else
         {
-            const ck::index_t n_per_block = slice_length[Number<0>{}] * slice_length[Number<2>{}];
-            const ck::index_t k_per_block = slice_length[Number<1>{}];
+            const intptr_t n_per_block = slice_length[Number<0>{}] * slice_length[Number<2>{}];
+            const intptr_t k_per_block = slice_length[Number<1>{}];

             // printf(" >>>> %d, %d, %d -> %d(%dx%d), %d\n", GemmN, GemmK, GemmN1, n_per_block,
             // dst_desc.GetTransforms()[Number<0>{}]
@@ -1053,8 +1056,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
             // n * k -> n0 * k * n1, n1 = 8, n0 = n/8
             for(index_t i_n_itr = 0; i_n_itr < n_per_block; i_n_itr += 8)
             {
-                ck::index_t current_n_8 = ck::math::min(GemmN - (i_n_itr + i_gemm_n), 8);
-                ck::index_t i_k_itr     = k_per_block;
+                intptr_t current_n_8 = ck::math::min(GemmN - (i_n_itr + i_gemm_n), (intptr_t)8);
+                intptr_t i_k_itr     = k_per_block;
                 if(current_n_8 == 8)
                 {
                     const float* p_src_k = p_src;
@@ -1151,7 +1154,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
                 {
                     for(index_t i_sub_k = 0; i_sub_k < k_per_block; i_sub_k++)
                     {
-                        ck::index_t i_current_n_itr = i_n_itr + i_sub_n + i_gemm_n;
+                        intptr_t i_current_n_itr = i_n_itr + i_sub_n + i_gemm_n;
                         float v = i_current_n_itr < GemmN
                                       ? element_op_.Apply(p_src_k[i_sub_n * GemmK + i_sub_k])
@@ -1171,8 +1174,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
     void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
     {
-        ck::index_t move_k  = src_slice_origin_step_idx[Number<1>{}];
-        ck::index_t move_n0 = src_slice_origin_step_idx[Number<0>{}];
+        intptr_t move_k  = src_slice_origin_step_idx[Number<1>{}];
+        intptr_t move_n0 = src_slice_origin_step_idx[Number<0>{}];

         // i_gemm_k += move_k;
@@ -1187,13 +1190,13 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
     private:
     const ElementwiseOperation element_op_;

-    ck::index_t i_gemm_n;
-    // ck::index_t i_gemm_k;
+    intptr_t i_gemm_n;
+    // intptr_t i_gemm_k;

-    // ck::index_t GemmN0;
-    ck::index_t GemmN1;
-    ck::index_t GemmN;
-    ck::index_t GemmK;
+    // intptr_t GemmN0;
+    intptr_t GemmN1;
+    intptr_t GemmN;
+    intptr_t GemmK;

     intptr_t src_offset;
 };
@@ -1226,9 +1229,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
     void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
     {
-        ck::index_t idx_n0 = src_slice_origin_idx[Number<0>{}];
-        ck::index_t idx_k  = src_slice_origin_idx[Number<1>{}];
-        ck::index_t idx_n1 = src_slice_origin_idx[Number<2>{}];
+        intptr_t idx_n0 = src_slice_origin_idx[Number<0>{}];
+        intptr_t idx_k  = src_slice_origin_idx[Number<1>{}];
+        intptr_t idx_n1 = src_slice_origin_idx[Number<2>{}];

         src_offset = idx_n0 * GemmK * GemmN1 + idx_k * GemmN1 + idx_n1;
@@ -1251,10 +1254,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
         }
         else
         {
-            const ck::index_t n0_per_block   = slice_length[Number<0>{}];
-            const ck::index_t k_n1_per_block = slice_length[Number<1>{}] * slice_length[Number<2>{}];
-            const ck::index_t SrcStride_K_N1 = GemmK * slice_length[Number<2>{}];
+            const intptr_t n0_per_block   = slice_length[Number<0>{}];
+            const intptr_t k_n1_per_block = slice_length[Number<1>{}] * slice_length[Number<2>{}];
+            const intptr_t SrcStride_K_N1 = GemmK * slice_length[Number<2>{}];

             // printf(" >>>> %d, %d, %d -> %d(%dx%d), %d\n", GemmN, GemmK, GemmN1, n_per_block,
             // dst_desc.GetTransforms()[Number<0>{}]
@@ -1356,9 +1358,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
     void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
     {
-        ck::index_t move_n0 = src_slice_origin_step_idx[Number<0>{}];
-        ck::index_t move_k  = src_slice_origin_step_idx[Number<1>{}];
-        ck::index_t move_n1 = src_slice_origin_step_idx[Number<2>{}];
+        intptr_t move_n0 = src_slice_origin_step_idx[Number<0>{}];
+        intptr_t move_k  = src_slice_origin_step_idx[Number<1>{}];
+        intptr_t move_n1 = src_slice_origin_step_idx[Number<2>{}];

         // i_gemm_k += move_k;
@@ -1373,13 +1375,13 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
     private:
     const ElementwiseOperation element_op_;

-    ck::index_t i_gemm_n;
-    // ck::index_t i_gemm_k;
+    intptr_t i_gemm_n;
+    // intptr_t i_gemm_k;

-    // ck::index_t GemmN0;
-    ck::index_t GemmN1;
-    ck::index_t GemmN;
-    ck::index_t GemmK;
+    // intptr_t GemmN0;
+    intptr_t GemmN1;
+    intptr_t GemmN;
+    intptr_t GemmK;

     intptr_t src_offset;
 };
@@ -1410,8 +1412,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
     void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
     {
-        ck::index_t idx_k = src_slice_origin_idx[Number<0>{}];
-        ck::index_t idx_n = src_slice_origin_idx[Number<1>{}];
+        intptr_t idx_k = src_slice_origin_idx[Number<0>{}];
+        intptr_t idx_n = src_slice_origin_idx[Number<1>{}];

         src_offset = idx_k * GemmN + idx_n;
     }
@@ -1431,8 +1433,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
         }
         else
         {
-            const ck::index_t k_per_block = slice_length[Number<0>{}];
-            const ck::index_t n_per_block = slice_length[Number<1>{}];
+            const intptr_t k_per_block = slice_length[Number<0>{}];
+            const intptr_t n_per_block = slice_length[Number<1>{}];

             const float* p_src = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
             float* p_dst       = reinterpret_cast<float*>(dst_buf.p_data_);
@@ -1497,8 +1499,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
     void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
     {
-        ck::index_t move_k = src_slice_origin_step_idx[Number<0>{}];
-        ck::index_t move_n = src_slice_origin_step_idx[Number<1>{}];
+        intptr_t move_k = src_slice_origin_step_idx[Number<0>{}];
+        intptr_t move_n = src_slice_origin_step_idx[Number<1>{}];

         src_offset += move_k * GemmN + move_n;
     }
@@ -1509,8 +1511,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
     private:
     const ElementwiseOperation element_op_;

-    ck::index_t GemmN;
-    ck::index_t GemmK;
+    intptr_t GemmN;
+    intptr_t GemmK;

     intptr_t src_offset;
 };
@@ -1587,14 +1589,14 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_MxN
         if constexpr(!std::is_same<ElementwiseOperation,
                                    ck::tensor_operation::cpu::element_wise::PassThrough>::value)
         {
-            const ck::index_t m_per_block = slice_length[Number<0>{}];
-            const ck::index_t n_per_block = slice_length[Number<1>{}];
+            const intptr_t m_per_block = slice_length[Number<0>{}];
+            const intptr_t n_per_block = slice_length[Number<1>{}];

-            const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+            const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);

             float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;

-            ck::index_t i_m_itr = m_per_block;
+            intptr_t i_m_itr = m_per_block;

             // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d,
             // dst_offset:%d\n",__LINE__, current_n,
@@ -1657,15 +1659,15 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_MxN
         }
         else
         {
-            const ck::index_t m_per_block = slice_length[Number<0>{}];
-            const ck::index_t n_per_block = slice_length[Number<1>{}];
+            const intptr_t m_per_block = slice_length[Number<0>{}];
+            const intptr_t n_per_block = slice_length[Number<1>{}];

-            const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+            const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);

             const float* p_src = reinterpret_cast<float*>(src_buf.p_data_) + src_offset;
             float* p_dst       = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;

-            ck::index_t i_m_itr = m_per_block;
+            intptr_t i_m_itr = m_per_block;

             // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d\n",__LINE__, current_n,
             // DstGemmN, n_per_block);fflush(stdout);
@@ -1740,11 +1742,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_MxN
     private:
     const ElementwiseOperation element_op_;

-    ck::index_t i_dst_gemm_m;
-    ck::index_t i_dst_gemm_n;
+    intptr_t i_dst_gemm_m;
+    intptr_t i_dst_gemm_n;

-    ck::index_t DstGemmM;
-    ck::index_t DstGemmN;
+    intptr_t DstGemmM;
+    intptr_t DstGemmN;

     intptr_t src_offset;
     intptr_t dst_offset;
@@ -1868,10 +1870,10 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
         if constexpr(!std::is_same<ElementwiseOperation,
                                    ck::tensor_operation::cpu::element_wise::PassThrough>::value)
         {
-            const ck::index_t m_per_block = slice_length[Number<0>{}];
-            const ck::index_t n_per_block = slice_length[Number<1>{}];
+            const intptr_t m_per_block = slice_length[Number<0>{}];
+            const intptr_t n_per_block = slice_length[Number<1>{}];

-            const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+            const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);

             float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
             const float* p_src1 =
@@ -1879,7 +1881,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
             const float* p_src2 = reinterpret_cast<const float*>(src2_buf.p_data_) + src2_offset;

-            ck::index_t i_m_itr = m_per_block;
+            intptr_t i_m_itr = m_per_block;

             // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d,
             // dst_offset:%d\n",__LINE__, current_n,
@@ -2129,17 +2131,17 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
         }
         else
        {
-            const ck::index_t m_per_block = slice_length[Number<0>{}];
-            const ck::index_t n_per_block = slice_length[Number<1>{}];
+            const intptr_t m_per_block = slice_length[Number<0>{}];
+            const intptr_t n_per_block = slice_length[Number<1>{}];

-            const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+            const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);

             const float* p_src  = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
             float* p_dst        = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
             const float* p_src1 = reinterpret_cast<const float*>(src1_buf.p_data_) + src1_offset;
             const float* p_src2 = reinterpret_cast<const float*>(src2_buf.p_data_) + src2_offset;

-            ck::index_t i_m_itr = m_per_block;
+            intptr_t i_m_itr = m_per_block;

             // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d\n",__LINE__, current_n,
             // DstGemmN, n_per_block);fflush(stdout);
@@ -2404,11 +2406,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
     private:
     const ElementwiseOperation element_op_;

-    ck::index_t i_dst_gemm_m;
-    ck::index_t i_dst_gemm_n;
+    intptr_t i_dst_gemm_m;
+    intptr_t i_dst_gemm_n;

-    ck::index_t DstGemmM;
-    ck::index_t DstGemmN;
+    intptr_t DstGemmM;
+    intptr_t DstGemmN;

     intptr_t src_offset;
     intptr_t src1_offset;
@@ -2526,16 +2528,16 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN
         if constexpr(!std::is_same<ElementwiseOperation,
                                    ck::tensor_operation::cpu::element_wise::PassThrough>::value)
         {
-            const ck::index_t m_per_block = slice_length[Number<0>{}];
-            const ck::index_t n_per_block = slice_length[Number<1>{}];
+            const intptr_t m_per_block = slice_length[Number<0>{}];
+            const intptr_t n_per_block = slice_length[Number<1>{}];

-            const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+            const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);

             float* p_dst        = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
             const float* p_src1 = reinterpret_cast<const float*>(src1_buf.p_data_) + src1_offset;

-            ck::index_t i_m_itr = m_per_block;
+            intptr_t i_m_itr = m_per_block;

             // standard 8-4-2-1 pattern
             if constexpr(Src1AlongDim0)
@@ -2745,16 +2747,16 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN
         }
         else
         {
-            const ck::index_t m_per_block = slice_length[Number<0>{}];
-            const ck::index_t n_per_block = slice_length[Number<1>{}];
+            const intptr_t m_per_block = slice_length[Number<0>{}];
+            const intptr_t n_per_block = slice_length[Number<1>{}];

-            const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+            const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);

             const float* p_src  = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
             float* p_dst        = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
             const float* p_src1 = reinterpret_cast<const float*>(src1_buf.p_data_) + src1_offset;

-            ck::index_t i_m_itr = m_per_block;
+            intptr_t i_m_itr = m_per_block;

             // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d\n",__LINE__, current_n,
             // DstGemmN, n_per_block);fflush(stdout);
@@ -2981,11 +2983,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN
     private:
     const ElementwiseOperation element_op_;

-    ck::index_t i_dst_gemm_m;
-    ck::index_t i_dst_gemm_n;
+    intptr_t i_dst_gemm_m;
+    intptr_t i_dst_gemm_n;

-    ck::index_t DstGemmM;
-    ck::index_t DstGemmN;
+    intptr_t DstGemmM;
+    intptr_t DstGemmN;

     intptr_t src_offset;
     intptr_t src1_offset;
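Across the slice-transfer specializations above, coordinates, per-block extents, and offsets move from 32-bit `ck::index_t` to `intptr_t` because an element offset is a product of several extents. A small hedged illustration of NHWC offset arithmetic kept 64-bit end to end (function and parameter names are hypothetical):

```cpp
// Illustrative, not project code: NHWC element offset where each extent fits
// in 32 bits but the product does not. intptr_t is 64-bit on the x86-64
// targets this AVX2 path assumes, so no intermediate wraps.
#include <cstdint>

inline std::intptr_t nhwc_offset(std::intptr_t n, std::intptr_t hi, std::intptr_t wi,
                                 std::intptr_t c,
                                 std::intptr_t Hi, std::intptr_t Wi, std::intptr_t C)
{
    // ((n * Hi + hi) * Wi + wi) * C + c can exceed 2^31 on large tensors
    // even when n, hi, wi, c individually fit comfortably in 32 bits.
    return ((n * Hi + hi) * Wi + wi) * C + c;
}
```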
library/src/host_tensor/device.cpp

 #include <chrono>
 #include <assert.h>
 #include <string.h>
 #include <stdlib.h>
 #include "device.hpp"

 #ifndef CK_NOGPU
@@ -85,15 +86,10 @@ DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t align
 {
     assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2

-    void* p1;
-    void** p2;
-    int offset = alignment - 1 + sizeof(void*);
-    p1         = malloc(mem_size + offset);
-    assert(p1 != nullptr);
-    p2     = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
-    p2[-1] = p1;
-    mpDeviceBuf = reinterpret_cast<void*>(p2);
+    // TODO: posix only
+    int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
+    assert(rtn == 0);
 }
 }
@@ -110,7 +106,7 @@ void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
 DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
 {
     if(mpDeviceBuf != nullptr)
-        free((reinterpret_cast<void**>(mpDeviceBuf))[-1]);
+        free(mpDeviceBuf);
 }

 struct WallTimerImpl
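The allocator change replaces the hand-rolled trick of over-allocating with `malloc`, rounding the pointer up, and stashing the original pointer at `p[-1]`, with a single `posix_memalign`, which is why the destructor can now `free(mpDeviceBuf)` directly. A hedged standalone restatement of the new scheme (as the commit's own TODO notes, `posix_memalign` is POSIX-only):

```cpp
// Hedged sketch mirroring the hunk above; a restatement, not the project file.
#include <cassert>
#include <cstdlib>

void* alloc_aligned(std::size_t mem_size, std::size_t alignment)
{
    // posix_memalign requires a power-of-two alignment that is also a
    // multiple of sizeof(void*).
    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
    void* buf = nullptr;
    int rtn = posix_memalign(&buf, alignment, mem_size); // POSIX-only
    assert(rtn == 0);
    return buf; // release with free(buf): no hidden base pointer to recover
}
```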